I'm trying to fetch the title from a webpage. The title is visible there as BM Wendling Real Estate.
The script I've tried can sometimes scrape it correctly, but most of the time the request fails with a 403 status. As the site bans IPs, I used proxies to get around that.
import random
import requests
from bs4 import BeautifulSoup
# Page whose business title the script tries to extract (passed to scrape() by the entry point).
link = 'https://www.veteranownedbusiness.com/business/25150/bm-wendling-real-estate'
# Browser-like User-Agent header sent with the request made in scrape().
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"}
def get_proxy_list():
    """Download the proxy table from sslproxies.org as ``"ip:port"`` strings.

    Only table rows whose text contains ``"yes"`` are kept; the first and
    second ``<td>`` of each such row supply the address and the port.

    Returns:
        list[str]: Proxy addresses in ``ip:port`` form (may be empty).

    Raises:
        requests.RequestException: If the listing page cannot be fetched in
            time or answers with an HTTP error status.
    """
    # Without a timeout requests may wait indefinitely on a stalled server.
    r = requests.get('https://www.sslproxies.org/', timeout=10)
    # Fail loudly here instead of silently parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    proxies = []
    for row in soup.select("table.table tr"):
        if "yes" not in row.text:
            continue
        ip_cell = row.select_one("td")
        port_cell = row.select_one("td:nth-of-type(2)")
        # Rows lacking data cells would otherwise raise AttributeError on .text.
        if ip_cell is None or port_cell is None:
            continue
        proxies.append(f"{ip_cell.text}:{port_cell.text}")
    return proxies
def get(proxies):
    """Pop a random proxy from ``proxies`` and wrap it for ``requests``.

    Args:
        proxies: Mutable list of ``"ip:port"`` strings. The chosen entry is
            removed in place so the same proxy is never handed out twice.

    Returns:
        dict: Mapping with ``'https'`` and ``'http'`` keys, both set to
        ``http://<ip:port>``, suitable for the ``proxies=`` argument of
        ``requests.get``.

    Raises:
        ValueError: If ``proxies`` is empty, i.e. every proxy has been used.
    """
    # random.randrange(0) would raise an opaque "empty range" ValueError;
    # raise the same exception type with a message that names the real cause.
    if not proxies:
        raise ValueError("proxy list is exhausted; no proxies left to try")
    proxy = proxies.pop(random.randrange(len(proxies)))
    return {'https': f'http://{proxy}', 'http': f'http://{proxy}'}
def scrape(url, proxy, proxies):
    """Fetch ``url`` through rotating proxies until the page title is extracted.

    Each attempt uses the current ``proxy``; on any failure (network error,
    non-200 status, missing title element) a new proxy is drawn from
    ``proxies`` via ``get`` and the request is retried.

    Args:
        url: Address of the page to download.
        proxy: Proxy mapping for the first attempt, as accepted by the
            ``proxies=`` argument of ``requests.get``.
        proxies: Pool handed to ``get`` to draw replacement proxies from.

    Returns:
        str: Stripped text of the ``.bizname_hdr > h1`` element.

    Raises:
        Exception: Whatever ``get`` raises once the pool has no proxies left;
            that is the only way the retry loop ends without a result.
    """
    while True:
        try:
            print("proxy being used: {}".format(proxy))
            r = requests.get(url, headers=headers, proxies=proxy, timeout=10)
            # Explicit check instead of ``assert``: asserts are removed when
            # Python runs with -O, which would let error pages through.
            if r.status_code != 200:
                raise requests.HTTPError(
                    f"unexpected status code {r.status_code}", response=r
                )
            soup = BeautifulSoup(r.text, "html.parser")
            heading = soup.select_one(".bizname_hdr > h1")
            if heading is None:
                raise LookupError("title element '.bizname_hdr > h1' not found")
            return heading.get_text(strip=True)
        except Exception as e:
            # Deliberately broad: any failure with this proxy means "try the
            # next one". Report the reason instead of swallowing it silently.
            print(f"attempt failed ({e!r}); switching proxy")
            proxy = get(proxies)
if __name__ == "__main__":
    # Build the proxy pool, draw the first proxy from it, then scrape the
    # target page (rotating proxies on failure) and print the title found.
    proxies = get_proxy_list()
    print(scrape(link, get(proxies), proxies))
Question: How can I scrape the title unhindered?
Note: The site restricts its access to a few countries.
from Can't fetch the title from a problematic webpage using requests implementing proxies within
No comments:
Post a Comment