I've written a script that runs Selenium locally as a web service so that I can make use of the rendered response within my Scrapy spider.
This is the web service where Selenium runs locally:
from flask import Flask, request, make_response
from flask_restful import Resource, Api
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

app = Flask(__name__)
api = Api(app)

class Selenium(Resource):
    _driver = None

    @staticmethod
    def getDriver():
        # Lazily create a single shared headless Chrome instance
        if not Selenium._driver:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            Selenium._driver = webdriver.Chrome(options=chrome_options)
        return Selenium._driver

    @property
    def driver(self):
        return Selenium.getDriver()

    def get(self):
        # Render the requested page with Selenium and return its source
        url = str(request.args['url'])
        self.driver.get(url)
        return make_response(self.driver.page_source)

api.add_resource(Selenium, '/')

if __name__ == '__main__':
    app.run(debug=True)
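For context, the service can also be exercised outside Scrapy; here is a minimal sanity check using the requests library (the target URL is just an example):

import requests
from urllib.parse import quote

# Ask the local service to render an arbitrary page
resp = requests.get('http://127.0.0.1:5000/?url={}'.format(quote('https://www.ebay.com')))
print(resp.status_code, len(resp.text))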
This is my Scrapy spider, which takes advantage of that response to parse the titles from the search page: [first script -> working one]
import scrapy
from urllib.parse import quote
from scrapy.crawler import CrawlerProcess

class ProductSpider(scrapy.Spider):
    name = 'products'
    url = 'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40'

    def start_requests(self):
        # Route the search page through the local Selenium service
        link = 'http://127.0.0.1:5000/?url={}'.format(quote(self.url))
        yield scrapy.Request(link, callback=self.parse)

    def parse(self, response):
        # Extract the listing titles from the rendered search results
        for item in response.css('h3.s-item__title::text').getall():
            yield {"name": item}

if __name__ == '__main__':
    c = CrawlerProcess()
    c.crawl(ProductSpider)
    c.start()
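As a side note, quote() percent-encodes the target URL before it is appended as the url query parameter, which is why the links in the error log further down look mangled. A quick illustration with a shortened URL:

from urllib.parse import quote

url = 'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0'
print('http://127.0.0.1:5000/?url={}'.format(quote(url)))
# -> http://127.0.0.1:5000/?url=http%3A//www.ebay.com/sch/i.html%3F_odkw%3Dbooks%26_osacat%3D0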
However, when I try to go another layer deep to fetch the title of each product from its target page, the approach no longer works: [second script -> doesn't work]
import scrapy
from urllib.parse import quote
from scrapy.crawler import CrawlerProcess

class ProductSpider(scrapy.Spider):
    name = 'products'
    url = 'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40'

    def start_requests(self):
        link = 'http://127.0.0.1:5000/?url={}'.format(quote(self.url))
        yield scrapy.Request(link, callback=self.parse)

    def parse(self, response):
        # Route every product link back through the local Selenium service
        for item in response.css('a.s-item__link::attr(href)').getall():
            link = 'http://127.0.0.1:5000/?url={}'.format(quote(item))
            yield scrapy.Request(link, callback=self.parse_info)

    def parse_info(self, response):
        # Grab the product title from the item page
        item = response.css('h1#itemTitle ::text').get()
        yield {"name": item}

if __name__ == '__main__':
    c = CrawlerProcess()
    c.crawl(ProductSpider)
    c.start()
What change should I make so that my second script works?
Currently, the script throws many similar errors like the following:
Traceback (most recent call last):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.ConnectionRefusedError: Connection was refused by other side: 10061: No connection could be made because the target machine actively refused it..
2019-05-17 22:33:52 [scrapy.core.scraper] ERROR: Error downloading <GET http://127.0.0.1:5000/?url=https%3A//www.ebay.com/itm/Paul-Joseph-Handbags-Green-Python-Genuine-Leather-Made-in-USA-7-x-13/123768401548%3Fhash%3Ditem1cd12bee8c%3Ag%3AXDgAAOSwpHtcevH0>
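Note that the refused connection is to the local service at 127.0.0.1:5000, not to eBay itself. My unconfirmed guess is that the Flask development server and its single shared Chrome session cannot keep up once the second spider starts firing many requests at it concurrently (Scrapy defaults to 16 concurrent requests). A sketch of throttling the spider, assuming that is the bottleneck:

import scrapy

class ProductSpider(scrapy.Spider):
    name = 'products'
    # Hypothetical mitigation: never send the single local service more
    # than one request at a time, and retry refused connections a few
    # extra times before giving up.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'RETRY_TIMES': 5,
    }

Alternatively, app.run(threaded=True) would let the Flask dev server handle requests in parallel, though the one shared Chrome instance would then be driven from several threads at once, which Selenium does not guarantee to be safe.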