Context
I'm trying to get all the data from this website in order to later use it in some model training project (ML).
I've chosen to do it by using Scrapy + Python 3.7. So far so good. I've set up my Scrapy project structure and I started working on the scraper. In order to do this, I created some steps that need to be followed in order to accordingly get the data that I need.
Steps
- First of all, we can see that when accessing the site's sitemap we can get all the categories that we need. (There's also a direct Products page, but unfortunately there's no way to get the category this way, so that's not a solution).
- Now, what we need to do is to access each sub-category, which will lead us to the Products page (where the infinite loading is). I've taken the first subcategory as an example.
- As we scroll down through the products, we can see that we have an infinite loading and a request is being made to get more products into the frontend:
- And finally, click on each product and get some data from it (This part is irrelevant for what I'm asking so you can skip the
Productclass from the code that I'll paste below)
Code
I've tried to reproduce the above by using the following piece of code:
import json
import re
import scrapy
PRODUCTS_XPATH = "//div[@class='col-md-3']//a/@href"
class Product:
def __init__(self, response):
self.response = response
def get_brand_name(self):
brand_name = self.response.xpath(
"normalize-space(//*[@class='product-brand-name-details']/text())"
).extract()
if not brand_name[0]:
brand_name = self.response.xpath(
"normalize-space(//h3[@class='font-weight-bold']/text())"
).extract()
return brand_name[0] if brand_name else 'Could not get product brand name.'
def get_brand_name_details(self):
brand_name_details = self.response.xpath(
"normalize-space(//*[@class='product-name-details']/text())"
).extract()
if not brand_name_details[0]:
brand_name_details = self.response.xpath(
"normalize-space(//h1[@class='title font-weight-bold']/text())"
).extract()
return brand_name_details[0] if brand_name_details else 'Could not get product brand name details.'
def get_real_category(self):
return self.response.meta.get('product_category')
def get_sku_details(self):
sku_details = self.response.xpath(
"normalize-space(//*[@class='product-sku-details']/text())"
).extract()
if not sku_details[0]:
sku_details = self.response.xpath(
"normalize-space(//h5[@class='font-weight-bold']/text())"
).extract()
return sku_details[0] if sku_details else 'Could not get product sku details.'
def get_short_desc_details(self):
short_desc_details = self.response.xpath(
"normalize-space(//p[@class='pt-2']/text())"
).extract()
return short_desc_details[0] if short_desc_details else 'Could not get product short desc details.'
def get_detail_list_price(self):
detail_list_price = self.response.xpath(
"normalize-space(//*[@class='product-detail-list-price']//text())"
).extract()
return detail_list_price[0] if detail_list_price else 'Could not get product detail list price.'
def get_price(self):
price = self.response.xpath(
"normalize-space(//*[@class='price']//text())"
).extract()
return price[0] if price else 'Could not get product price.'
def get_detail_price_save(self):
detail_price_save = self.response.xpath(
"normalize-space(//*[@class='product-detail-price-save']//text())"
).extract()
return detail_price_save[0] if detail_price_save else 'Could not get product detail price save.'
def get_detail_note(self):
detail_note = self.response.xpath(
"normalize-space(//*[@class='product-detail-note']//text())"
).extract()
return detail_note[0] if detail_note else 'Could not get product detail note.'
def get_detail_long_desc(self):
detail_long_descriptions = self.response.xpath(
"//*[@id='desc']/node()"
).extract()
detail_long_desc = ''.join([x.strip() for x in detail_long_descriptions if x.strip()])
return detail_long_desc if detail_long_desc else 'Could not get product detail long desc.'
def get_image(self):
image = self.response.xpath(
"normalize-space(//*[@id='mainContent_imgDetail']/@src)"
).extract()
return f'https://bannersolutions.com{image[0]}' if image else 'Could not get product image.'
def get_pieces_in_stock(self):
pieces_in_stock = self.response.xpath(
"normalize-space(//*[@class='badge-success']//text())"
).extract()
return pieces_in_stock[0] if pieces_in_stock else 'Unknown pieces in stock.'
def get_meta_description(self):
meta_description = self.response.xpath(
"normalize-space(//*[@name='description']/@content)"
).extract()
return meta_description[0] if meta_description else 'Could not get product meta description.'
def to_json(self):
return {
'product_brand_name_details': self.get_brand_name_details(),
'product_brand_name': self.get_brand_name(),
'product_category': self.get_real_category(),
'product_sku_details': self.get_sku_details(),
'product_short_desc_details': self.get_short_desc_details(),
'product_detail_list_price': self.get_detail_list_price(),
'product_price': self.get_price(),
'product_detail_price_save': self.get_detail_price_save(),
'product_detail_note': self.get_detail_note(),
'product_detail_long_desc': self.get_detail_long_desc(),
'product_image': self.get_image(),
'product_in_stock': self.get_pieces_in_stock(),
'product_meta_description': self.get_meta_description()
}
class BannerSolutionsSpider(scrapy.Spider):
name = 'bannersolutions'
start_urls = ['https://bannersolutions.com/Sitemap']
allowed_domains = ['bannersolutions.com']
def start_crawl(self, response):
for url in self.start_urls:
yield scrapy.Request(url)
def parse(self, response):
for category in response.xpath('(//div[@class="col-md-3"])[1]/ul/li'):
main_category_name = category.xpath('./a/text()').get()
sub_category_name = category.xpath('./ul/li/a/text()').get()
category_url = category.xpath('./ul/li/a/@href').get()
if category_url:
yield scrapy.Request(f'https://bannersolutions.com{category_url}', callback=self.parse_categories,
meta={'product_category': f'{main_category_name}/{sub_category_name}'})
def parse_categories(self, response):
title = response.xpath('//h1[@class="title"]/text()').get()
products_in_category = re.match(r'.*\((\d+)\)', title).group(1)
no_of_requests = int(products_in_category) // 8 + 1
in_cat_id = response.url.split('/')[-1]
for i in range(1, no_of_requests):
payload = {
'pageIndex': str(i),
'inViewType': 'grid',
'inPageSize': '8',
'inCatID': in_cat_id,
'inFilters': '',
'inSortType': ''
}
yield scrapy.Request(
'https://bannersolutions.com/catalog.aspx/GetProducts',
method='POST',
headers={"content-type": "application/json"},
body=json.dumps(payload),
callback=self.parse_plm,
meta={'product_category': response.meta.get('product_category')}
)
def parse_plm(self, response):
products_str_html = json.loads(response.body).get('d')
product_url = scrapy.selector.Selector(text=products_str_html).xpath(
'//div[@class="product-image-container"]//a/@href'
).get()
yield scrapy.Request(
f'https://bannersolutions.com{product_url}',
callback=self.parse_product,
meta={'product_category': response.meta.get('product_category')}
)
def parse_product(self, response):
product = Product(response).to_json()
yield product
Issues
The issue with my code is that not all the products are being parsed, only ~3k / out of 70k. Now, Where I suppose it's the issue is between the lines 148-165. I've ran it through the debugger but I still couldn't figure out what's wrong.
Can someone please explain me what's wrong in my code logic?
from Understanding infinite loading when using Scrapy - what's wrong?



No comments:
Post a Comment