I have currently written a webscraping where I compare two values to see if there has been any increased value from previous request compare to new request.
import json
import re
import time
from dataclasses import dataclass
from typing import Optional, List
import requests
from bs4 import BeautifulSoup
@dataclass
class Product:
name: Optional[str]
price: Optional[str]
image: Optional[str]
sizes: List[str]
@staticmethod
def get_sizes(doc: BeautifulSoup) -> List[str]:
pat = re.compile(
r'^<script>var JetshopData='
r'(\{.*\})'
r';</script>$',
)
for script in doc.find_all('script'):
match = pat.match(str(script))
if match is not None:
break
else:
return []
data = json.loads(match[1])
return [
variation
for get_value in data['ProductInfo']['Attributes']['Variations']
if get_value.get('IsBuyable')
for variation in get_value['Variation']
]
@classmethod
def from_page(cls, url: str) -> Optional['Product']:
with requests.get(url) as response:
response.raise_for_status()
doc = BeautifulSoup(response.text, 'html.parser')
name = doc.select_one('h1.product-page-header')
price = doc.select_one('span.price')
image = doc.select_one('meta[property="og:image"]')
return cls(
name=name and name.text.strip(),
price=price and price.text.strip(),
image=image and image['content'],
sizes=cls.get_sizes(doc),
)
def main():
product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")
previous_request = product.sizes
while True:
product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")
if set(product.sizes) - set(previous_request):
print("new changes on the webpage")
previous_request = product.sizes
else:
print("No changes made")
time.sleep(500)
if __name__ == '__main__':
main()
The problem I am facing is that there is a scenario where the product can be taken down. For example if I now I have found ['US 9,5/EUR 43', 'US 10,5/EUR 44,5']
and the webpage gets taken down by the admin where it returns 404. After few hours they re-add back the webpage and add again the values ['US 9,5/EUR 43', 'US 10,5/EUR 44,5']
- That would not print the value we already had it before on our previous valid request.
I wonder what would be the best way to print out the values if a webpage returns from 404 back to 200 (even if they add the same value?)
from How to compare variables if not http 200 status
No comments:
Post a Comment