I've created a spider that parses links from different containers on the landing pages of several similar sites (supplied via a text file) and then follows each link to grab the title from its inner page. A few of the landing pages have a next-page button, which the spider handles accordingly.
The spider does parse the content, but it falls into an infinite loop caused by the dont_filter=True parameter. If I don't use that parameter, the spider never retries the links that failed to produce the desired response in the first place. I've used dont_filter=True in three places (the re-request itself is sketched right after this list):

- in the _retry() method within the middleware
- in the last line of the parse() method
- in the last line of the parse_content() method
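For clarity, each of those call sites re-yields the same URL in the shape below. As far as I understand, dont_filter=True bypasses Scrapy's dupefilter, so nothing bounds how many times a permanently failing page gets re-scheduled. This is a trimmed-down sketch of my own re-request, not new logic:

    # With default settings Scrapy's dupefilter drops a request whose URL has
    # already been seen, so this re-yield would normally be ignored after the
    # first attempt. dont_filter=True skips that check, which is exactly what
    # lets a page that never returns the expected markup loop forever.
    yield scrapy.Request(lead_link, meta={"lead_link": lead_link},
                         callback=self.parse, dont_filter=True)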
The spider I've created:
import os
import scrapy
import urllib
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess


class YelpSpider(scrapy.Spider):
    name = "yelpspidescript"

    with open("all_urls.txt") as f:
        start_urls = f.readlines()

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, meta={"lead_link": url})

    def parse(self, response):
        if response.meta.get("lead_link"):
            lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            lead_link = response.meta.get("redirect_urls")[0]

        soup = BeautifulSoup(response.text, 'lxml')
        if soup.select("[class*='hoverable'] h4 a[href^='/biz/'][name]"):
            for item in soup.select("[class*='hoverable'] h4 a[href^='/biz/'][name]"):
                lead_link = response.urljoin(item.get("href"))
                yield scrapy.Request(lead_link, meta={"lead_link": lead_link}, callback=self.parse_content)

            next_page = soup.select_one("a[class*='next-link'][href^='/search?']")
            if next_page:
                link = response.urljoin(next_page.get("href"))
                yield scrapy.Request(link, meta={"lead_link": link}, callback=self.parse)
        else:
            # expected result blocks missing -> re-request the same page
            yield scrapy.Request(lead_link, meta={"lead_link": lead_link}, callback=self.parse, dont_filter=True)

    def parse_content(self, response):
        if response.meta.get("lead_link"):
            lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            lead_link = response.meta.get("redirect_urls")[0]

        soup = BeautifulSoup(response.text, 'lxml')
        if soup.select_one("h1[class*='heading--inline__']"):
            try:
                name = soup.select_one("h1[class*='heading--inline__']").get_text(strip=True)
            except AttributeError:
                name = ""
            print(name)
        else:
            # expected title missing -> re-request the same page
            yield scrapy.Request(lead_link, meta={"lead_link": lead_link}, callback=self.parse_content, dont_filter=True)


if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YelpSpider)
    c.start()
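In case it's relevant, one thing I've been considering inside the callbacks is carrying an attempt counter in meta so a page that never yields the expected markup is only re-requested a bounded number of times. A rough sketch of what I mean (the parse_retries key and the cap of 3 are made up for illustration, not from my code):

    # Hypothetical guard: re-yield a failing page at most a few times,
    # then give up instead of looping forever.
    retries = response.meta.get("parse_retries", 0)
    if retries < 3:  # illustrative cap
        yield scrapy.Request(
            lead_link,
            meta={"lead_link": lead_link, "parse_retries": retries + 1},
            callback=self.parse_content,
            dont_filter=True,
        )

I'm not sure this is the idiomatic way to do it, which is part of why I'm asking.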
The middleware:
from fake_useragent import UserAgent
from scrapy.utils.response import response_status_message

RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 403, 401, 400, 404]


class yelp_custom_Middleware(object):
    ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random

    def process_exception(self, request, exception, spider):
        return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        # re-schedule a copy of the failed request, bypassing the dupefilter
        retryreq = request.copy()
        retryreq.dont_filter = True
        return retryreq

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in RETRY_HTTP_CODES:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response
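For comparison, Scrapy's built-in RetryMiddleware avoids this kind of loop by counting attempts in request.meta and giving up past a cap. A minimal sketch of that pattern applied to my _retry() (the cap of 3 is illustrative; the built-in middleware reads its cap from the RETRY_TIMES setting):

    def _retry(self, request, reason, spider):
        # Bounded-retry pattern: carry an attempt counter in request.meta
        # and stop re-scheduling once it passes the cap.
        retries = request.meta.get('retry_times', 0) + 1
        if retries <= 3:  # illustrative cap
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            return retryreq
        return None  # give up on this request

Returning None would make the `self._retry(request, reason, spider) or response` line in process_response hand back the original response instead of re-scheduling indefinitely.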
How can I keep the spider from falling into an infinite loop?