I'm trying to fetch the title from a webpage. The title is visible there as BM Wendling Real Estate. The script I've tried can sometimes scrape it correctly, but most of the time it gets a 403 status. As the site bans IPs, I used proxies to bypass that.
import random
import requests
from bs4 import BeautifulSoup
# Page to scrape; this URL is what the __main__ block passes to scrape().
link = 'https://www.veteranownedbusiness.com/business/25150/bm-wendling-real-estate'
# Browser-like User-Agent header sent by scrape() with each request to the target page.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"}
def get_proxy_list():
    """Download the proxy table from sslproxies.org and return its entries.

    Only table rows whose text contains ``"yes"`` are kept (presumably the
    rows flagged as HTTPS-capable -- confirm against the page layout).

    Returns:
        A list of ``"ip:port"`` strings built from the first two ``<td>``
        cells of each retained row. May be empty.

    Raises:
        requests.RequestException: If the listing page cannot be fetched
            (connection failure, timeout, or a non-success HTTP status).
    """
    # Without a timeout a stalled listing site would hang the script forever.
    r = requests.get('https://www.sslproxies.org/', timeout=10)
    # Fail loudly on an error page instead of silently returning no proxies.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    proxies = []
    for row in soup.select("table.table tr"):
        if "yes" not in row.text:
            continue
        host_cell = row.select_one("td")
        port_cell = row.select_one("td:nth-of-type(2)")
        # Skip rows that match the text filter but lack both cells
        # (e.g. header/footer rows) rather than crash on None.
        if host_cell is None or port_cell is None:
            continue
        proxies.append(':'.join([host_cell.text, port_cell.text]))
    return proxies
def get(proxies):
    """Remove one randomly chosen proxy from the pool and wrap it for requests.

    Args:
        proxies: Mutable list of ``"host:port"`` strings. The chosen entry
            is popped from the list so it is never handed out twice.

    Returns:
        A dict with ``'https'`` and ``'http'`` keys, both set to
        ``http://<host:port>``, suitable for the ``proxies=`` argument of
        ``requests.get``.

    Raises:
        ValueError: If ``proxies`` is empty.
    """
    # random.randrange(0) would also raise ValueError, but with the opaque
    # message "empty range for randrange()"; say what actually went wrong.
    if not proxies:
        raise ValueError("proxy pool is exhausted: no proxies left to try")
    proxy = proxies.pop(random.randrange(len(proxies)))
    return {'https': f'http://{proxy}', 'http': f'http://{proxy}'}
def scrape(url, proxy, proxies):
    """Fetch ``url`` through rotating proxies until the business title is found.

    Each attempt prints the proxy in use, requests the page with the
    module-level ``headers`` and a 10-second timeout, and looks for the
    ``.bizname_hdr > h1`` element. On any failed attempt a fresh proxy is
    drawn from ``proxies`` via ``get`` and the request is retried.

    Args:
        url: Address of the page to scrape.
        proxy: Proxy mapping (as returned by ``get``) for the first attempt.
        proxies: Remaining pool of ``"host:port"`` strings; consumed by
            ``get`` on every retry.

    Returns:
        The stripped text of the ``.bizname_hdr > h1`` element.

    Raises:
        Whatever ``get`` raises once ``proxies`` has been emptied; that is
        the only way the retry loop ends without a result.
    """
    while True:
        print("proxy being used: {}".format(proxy))
        try:
            r = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.RequestException:
            # Dead, slow or refusing proxy: expected with free proxy lists.
            r = None

        # Explicit check instead of `assert`, which is stripped under -O.
        if r is not None and r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")
            heading = soup.select_one(".bizname_hdr > h1")
            # A 200 response without the heading (block/captcha page served
            # through the proxy) counts as a failed attempt, not a crash.
            if heading is not None:
                return heading.get_text(strip=True)

        proxy = get(proxies)
if __name__ == "__main__":
    # Build the proxy pool, draw the first proxy, then scrape and show the title.
    proxy_pool = get_proxy_list()
    first_proxy = get(proxy_pool)
    print(scrape(link, first_proxy, proxy_pool))
Question: How can I scrape the title unhindered?
Note: The site restricts its access to a few countries.
from Can't fetch the title from a problematic webpage using requests implementing proxies within
No comments:
Post a Comment