I've written a script in python's scrapy to make a proxied requests using either of the newly generated proxies by get_proxies()
method. I used requests
module to fetch the proxies in order to reuse them in the script. However, the problem is the proxy my script chooses to use may not be the good one always so sometimes it doesn't fetch valid response.
How can I let my script keep trying with different proxies until there is a valid response?
My script so far:
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.http.request import Request
from scrapy.crawler import CrawlerProcess
class ProxySpider(scrapy.Spider):
name = "sslproxies"
check_url = "https://stackoverflow.com/questions/tagged/web-scraping"
proxy_link = "https://www.sslproxies.org/"
def start_requests(self):
proxylist = self.get_proxies()
random.shuffle(proxylist)
proxy_ip_port = next(cycle(proxylist))
print(proxy_ip_port) #Checking out the proxy address
request = scrapy.Request(self.check_url, callback=self.parse,errback=self.errback_httpbin,dont_filter=True)
request.meta['proxy'] = "http://{}".format(proxy_ip_port)
yield request
def get_proxies(self):
response = requests.get(self.proxy_link)
soup = BeautifulSoup(response.text,"lxml")
proxy = [':'.join([item.select_one("td").text,item.select_one("td:nth-of-type(2)").text]) for item in soup.select("table.table tbody tr") if "yes" in item.text]
return proxy
def parse(self, response):
print(response.meta.get("proxy")) #Compare this to the earlier one whether they both are the same
def errback_httpbin(self, failure):
print("Failure: "+str(failure))
if __name__ == "__main__":
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'DOWNLOAD_TIMEOUT' : 5,
})
c.crawl(ProxySpider)
c.start()
PS My intension is to seek any solution the way I've started here.
from Unable to use proxies one by one until there is a valid response
No comments:
Post a Comment