I'm working on a web scraper and I am having trouble with grabbing the correct selector. Here's my code:
# -*- coding: utf-8 -*-
import scrapy
import pandas as pd
from ..items import HomedepotpricespiderItem
from scrapy.http import Request
class HomedepotspiderSpider(scrapy.Spider):
name = 'homeDepotSpider'
allowed_domains = ['homedepot.com']
start_urls = ['https://www.homedepot.com/pep/304660691']#.format(omsID = omsID)
#for omsID in omsList]
def parse(self, response):
#call home depot function
for item in self.parseHomeDepot(response):
yield item
pass
def parseHomeDepot(self, response):
#get top level item
items = response.css('#zone-a-product')
for product in items:
item = HomedepotpricespiderItem()
#get the price
productPrice = product.xpath('//div[@class="price-format__main-price"]/span/text()').getall()
#get rid of all the stuff i dont need
item['productPrice'] = productPrice
yield item
So with my current selector it looks like it is grabbing the price of these items.
because my output is:
'productPrice': ['$',
'2167',
'49',
'$',
'1798',
'00',
'$',
'2698',
'00',
'$',
'2099',
'99',
'$',
'2968',
'00',
'$',
'2294',
'99',
'$',
'2068',
'00',
'$',
'1649',
'99',
'$',
'2399',
'00',
'$',
'1649',
'99',
'$',
'1549',
'99',
'$',
'1799',
'99',
'$',
'3360',
'89',
'$',
'2899',
'95',
'$',
'3699',
'00',
'$',
'2719',
'96',
'$',
'1954',
'99',
'$',
'2699',
'00',
'$',
'2294',
'96',
'$',
'3149',
'00',
'$',
'3499',
'00',
'$',
'3749',
'00',
'$',
'4999',
'00',
'$',
'2799',
'99'],
when the correct output should be: 2099
Additionally, I don't think that my selector is even grabbing the price of the item at all.
from Scrapy - Correct Selector for dynamically created field
No comments:
Post a Comment