Friday 13 November 2020

Can't fetch the title from a problematic webpage using requests implementing proxies within

I'm trying to fetch the title from a webpage. The title is visible there as BM Wendling Real Estate. The script I've tried can sometimes scrape it correctly, but most of the time the request fails with a 403 status. As the site bans IPs, I used proxies to bypass that.

import random
import requests
from bs4 import BeautifulSoup

# Page whose title the script tries to scrape.
link = "https://www.veteranownedbusiness.com/business/25150/bm-wendling-real-estate"

# Browser-like User-Agent header sent with the request for the target page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
    ),
}

def get_proxy_list():
    """Download the proxy table from sslproxies.org as "ip:port" strings.

    Only table rows whose text contains "yes" are kept; the first two cells
    of such a row are joined with a colon.

    Returns:
        list[str]: one "ip:port" entry per matching row (possibly empty).

    Raises:
        requests.RequestException: if the listing cannot be downloaded or
            the server answers with an error status.
    """
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get('https://www.sslproxies.org/', timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text,"html.parser")

    proxies = []
    for row in soup.select("table.table tr"):
        if "yes" not in row.text:
            continue
        cells = row.select("td")
        # Skip rows without both cells instead of crashing on a missing one.
        if len(cells) < 2:
            continue
        proxies.append(f"{cells[0].text}:{cells[1].text}")
    return proxies

def get(proxies):
    """Pop a random entry from *proxies* and wrap it as a requests proxy mapping.

    Args:
        proxies: mutable list of "ip:port" strings. The chosen entry is
            removed from the list so it is not drawn again.

    Returns:
        dict: mapping with "https" and "http" keys, both pointing at
        ``http://<ip:port>`` for the chosen proxy.

    Raises:
        ValueError: if *proxies* is empty.
    """
    if not proxies:
        # random.randrange(0) would raise ValueError too, but its
        # "empty range" message hides the real cause from the caller.
        raise ValueError("no proxies left to choose from")
    proxy = proxies.pop(random.randrange(len(proxies)))
    return {'https': f'http://{proxy}','http': f'http://{proxy}'}

def scrape(url,proxy,proxies):
    """Fetch *url* through rotating proxies until the business title is found.

    Each attempt uses the current proxy. If the request fails, the response
    status is not 200, or the ``.bizname_hdr > h1`` element is missing, a new
    proxy is drawn from *proxies* via ``get`` and the attempt is repeated.

    Args:
        url: address of the page to fetch.
        proxy: proxy mapping to use for the first attempt.
        proxies: pool that replacement proxies are drawn from (mutated by
            ``get``).

    Returns:
        str: stripped text of the ``.bizname_hdr > h1`` element.

    Raises:
        ValueError: propagated from ``get`` once the proxy pool is empty.
    """
    while True:
        print(f"proxy being used: {proxy}")
        try:
            r = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.RequestException:
            # Dead or blocked proxy, timeout, etc.: fall through and rotate.
            r = None

        # Explicit status check; an `assert` would be stripped under `python -O`.
        if r is not None and r.status_code == 200:
            soup = BeautifulSoup(r.text,"html.parser")
            heading = soup.select_one(".bizname_hdr > h1")
            if heading is not None:
                return heading.get_text(strip=True)

        # This proxy did not yield the title; try the next one.
        proxy = get(proxies)

if __name__ == "__main__":
    # Build the proxy pool and draw the proxy for the first attempt.
    proxies = get_proxy_list()
    proxy = get(proxies)

    # scrape() keeps rotating proxies until it can return the title.
    print(scrape(link, proxy, proxies))

Question: How can I scrape the title unhindered?

Note: The site restricts its access to a few countries.



from Can't fetch the title from a problematic webpage using requests implementing proxies within

No comments:

Post a Comment