Monday, 31 August 2020

Can't make use of dont_filter=True within a spider in the right way to avoid some unwanted activity

I've created a spider that parses links from different containers on the landing pages of some identical sites (supplied through a text file) and then follows each link to scrape the title from its inner page. A few of the pages have a next-page button, which the spider handles accordingly.

The spider does parse the content, but it falls into an infinite loop caused by the dont_filter=True parameter. If I don't use that parameter, the spider never re-requests the links that failed to produce the desired response the first time.

I've used the dont_filter=True parameter in three places:

  1. In the _retry() method within the middleware
  2. In the last line of the parse() method
  3. In the last line of the parse_content() method
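
To illustrate what I understand dont_filter=True to change, here is a minimal, separate sketch (not my real spider; the DemoSpider name and the example.com URL are just placeholders, and Scrapy's default duplicate filter, RFPDupeFilter, is assumed):

import scrapy


class DemoSpider(scrapy.Spider):
    name = "dont_filter_demo"
    start_urls = ["https://example.com/"]  # placeholder URL

    def parse(self, response):
        # Without dont_filter, the scheduler's duplicate filter silently drops
        # this request because the same URL has already been seen:
        yield scrapy.Request(response.url, callback=self.parse)

        # With dont_filter=True the duplicate filter is bypassed, so the same
        # URL keeps being re-queued, which is what I suspect happens in the
        # fallback requests of my parse() and parse_content() methods below:
        yield scrapy.Request(response.url, callback=self.parse, dont_filter=True)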

The spider I've created:

import os
import scrapy
import urllib
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess


class YelpSpider(scrapy.Spider):
    name = "yelpspidescript"

    with open("all_urls.txt") as f:
        # strip newlines so the requests are built from clean URLs
        start_urls = [line.strip() for line in f if line.strip()]
   
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,callback=self.parse,meta={"lead_link":url})

    def parse(self,response):
        if response.meta.get("lead_link"):
            lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            lead_link = response.meta.get("redirect_urls")[0]

        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.select("[class*='hoverable'] h4 a[href^='/biz/'][name]")
        if items:
            for item in items:
                lead_link = response.urljoin(item.get("href"))
                yield scrapy.Request(lead_link,meta={"lead_link":lead_link},callback=self.parse_content)

            next_page = soup.select_one("a[class*='next-link'][href^='/search?']")
            if next_page:
                link = response.urljoin(next_page.get("href"))
                yield scrapy.Request(link,meta={"lead_link":link},callback=self.parse)

        else:
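            # none of the expected elements were found (i.e. the response did
            # not produce the desired content); re-issue the same request, and
            # dont_filter=True keeps the dupe filter from discarding it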
            yield scrapy.Request(lead_link,meta={"lead_link":lead_link},callback=self.parse,dont_filter=True)
            
    def parse_content(self,response):
        if response.meta.get("lead_link"):
            lead_link = response.meta.get("lead_link")
        elif response.meta.get("redirect_urls"):
            lead_link = response.meta.get("redirect_urls")[0]

        soup = BeautifulSoup(response.text, 'lxml')

        if soup.select_one("h1[class*='heading--inline__']"):
            try:
                name = soup.select_one("h1[class*='heading--inline__']").get_text(strip=True)
            except AttributeError: name = ""
            print(name)

        else:
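            # the title markup is missing, so re-request this inner page;
            # dont_filter=True bypasses the dupe filter for the repeated URL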
            yield scrapy.Request(lead_link,meta={"lead_link":lead_link},callback=self.parse_content,dont_filter=True)
            

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT':'Mozilla/5.0',
        'LOG_LEVEL':'ERROR',
    })
    c.crawl(YelpSpider)
    c.start()

The middleware:

from fake_useragent import UserAgent
from scrapy.utils.response import response_status_message


RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 403, 401, 400, 404]

class yelp_custom_Middleware(object):
    ua = UserAgent() 

    def process_request(self, request, spider):
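        # rotate the User-Agent header on every outgoing request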
        request.headers['User-Agent'] = self.ua.random

    def process_exception(self, request, exception, spider):
        return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
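        # return a copy of the failed request so Scrapy re-schedules it;
        # dont_filter=True keeps the dupe filter from dropping the copy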
        retryreq = request.copy()
        retryreq.dont_filter = True
        return retryreq

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in RETRY_HTTP_CODES:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response
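
For comparison, Scrapy's stock RetryMiddleware keeps a counter in request.meta['retry_times'] and gives up once the RETRY_TIMES setting is exceeded, whereas my _retry() above re-queues the request unconditionally. Below is a rough, untested sketch of that counting pattern (it reuses the RETRY_HTTP_CODES list from above; the bounded_retry_Middleware name and the cap of 3 are arbitrary examples, not something I currently have in place):

class bounded_retry_Middleware(object):
    max_retries = 3  # arbitrary example cap

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        if retries > self.max_retries:
            return None  # give up instead of re-queueing the request forever
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        return retryreq

    def process_response(self, request, response, spider):
        if response.status in RETRY_HTTP_CODES:
            return self._retry(request, response.status, spider) or response
        return response

Either way, a custom middleware like this only takes effect if it is enabled through the DOWNLOADER_MIDDLEWARES setting.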

How can I keep the spider from falling into an infinite loop?

EDIT: I thought I'd include a few of the URLs I'm trying (the ones in the all_urls.txt file) in case it helps identify the issue.


