My spider raises `AttributeError: Response content isn't text`. I am using scrapy_proxy_pool and scrapy_user_agents, and I am trying to collect each and every link on the target website.
import scrapy
class LinksSpider(scrapy.Spider):
    """Crawl chotosite.com and emit every on-site link found in its HTML pages.

    Each newly discovered link is both followed (a new request with the same
    callback) and emitted as an item of the form ``{"links": <url>}``.
    """

    name = 'links'
    allowed_domains = ['www.chotosite.com', 'chotosite.com']
    # Links that have already been scheduled. This is a class-level list, so
    # it is shared by every instance of the spider.
    extracted_links = []

    def start_requests(self):
        """Seed the crawl with the site's home page."""
        start_url = 'https://www.chotosite.com'
        yield scrapy.Request(url=start_url, callback=self.extract_link)

    def extract_link(self, response):
        """Yield a follow-up request and an item for each new on-site link.

        Responses whose Content-Type is not HTML (images, PDFs, ...) are
        skipped, because they have no anchors to extract.
        """
        # Eliminate image (and other non-HTML) responses. The header value is
        # bytes, may be missing, and its charset/casing varies between
        # servers, so match on the media type only instead of comparing the
        # whole header against one exact string.
        content_type = response.headers.get('Content-Type') or b''
        if b'text/html' not in content_type.lower():
            return

        for href in response.xpath("//a/@href").extract():
            # Resolve relative hrefs ("/about", "page.html") against the
            # current page; scrapy.Request needs an absolute URL.
            link = response.urljoin(href)
            if "chotosite" not in link or link in self.extracted_links:
                continue
            self.extracted_links.append(link)
            yield scrapy.Request(url=link, callback=self.extract_link)
            yield {
                "links": link
            }
Here is my settings.py file:
# Scrapy project settings for the "chotosite" crawler.
BOT_NAME = 'chotosite'

# Package that contains the spiders; newly generated spiders go to the same place.
NEWSPIDER_MODULE = 'chotosite.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# robots.txt rules are NOT obeyed by this crawler.
ROBOTSTXT_OBEY = False

AUTOTHROTTLE_ENABLED = True

# Switch for the scrapy_proxy_pool middlewares registered below.
PROXY_POOL_ENABLED = True

# Downloader middleware chain: a value of None disables the entry, a number
# is the middleware's order.
DOWNLOADER_MIDDLEWARES = {
    # Replace Scrapy's stock user-agent middleware with a randomising one.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
    # NOTE(review): the traceback shows scrapy_proxy_pool's ban check reading
    # response.text, which raises AttributeError on non-text responses.
    # A custom ban policy may be needed -- confirm against the package docs.
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
}
Here is the full console output on Pastebin: https://pastebin.com/tRbfvxdN
And here is the traceback:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/lib/python3/dist-packages/scrapy/core/downloader/middleware.py", line 44, in process_request
defer.returnValue((yield download_func(request=request, spider=spider)))
File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1362, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 https://www.chotosite.com>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/lib/python3/dist-packages/scrapy/core/downloader/middleware.py", line 53, in process_response
response = yield method(request=request, response=response, spider=spider)
File "/usr/local/lib/python3.8/dist-packages/scrapy_proxy_pool/middlewares.py", line 287, in process_response
ban = is_ban(request, response)
File "/usr/local/lib/python3.8/dist-packages/scrapy_proxy_pool/policy.py", line 15, in response_is_ban
if self.BANNED_PATTERN.search(response.text):
File "/usr/lib/python3/dist-packages/scrapy/http/response/__init__.py", line 93, in text
raise AttributeError("Response content isn't text")
AttributeError: Response content isn't text
And this is what is installed on my system:
Scrapy : 1.7.3
lxml : 4.5.0.0
libxml2 : 2.9.10
cssselect : 1.1.0
parsel : 1.5.2
w3lib : 1.21.0
Twisted : 18.9.0
Python : 3.8.5 (default, Jul 28 2020, 12:59:40) - [GCC 9.3.0]
pyOpenSSL : 19.0.0 (OpenSSL 1.1.1f 31 Mar 2020)
cryptography : 2.8
Platform : Linux-5.4.0-53-generic-x86_64-with-glibc2.29
from raise AttributeError: Response content isn't text Scrapy proxy pool. How to solve?
No comments:
Post a Comment