Sunday, 18 July 2021

How to compare variables if not http 200 status

I have currently written a webscraping where I compare two values to see if there has been any increased value from previous request compare to new request.

import json
import re
import time
from dataclasses import dataclass
from typing import Optional, List

import requests
from bs4 import BeautifulSoup


@dataclass
class Product:
    name: Optional[str]
    price: Optional[str]
    image: Optional[str]
    sizes: List[str]

    @staticmethod
    def get_sizes(doc: BeautifulSoup) -> List[str]:
        pat = re.compile(
            r'^<script>var JetshopData='
            r'(\{.*\})'
            r';</script>$',
        )
        for script in doc.find_all('script'):
            match = pat.match(str(script))
            if match is not None:
                break
        else:
            return []

        data = json.loads(match[1])
        return [
            variation
            for get_value in data['ProductInfo']['Attributes']['Variations']
            if get_value.get('IsBuyable')
            for variation in get_value['Variation']
        ]

    @classmethod
    def from_page(cls, url: str) -> Optional['Product']:
        with requests.get(url) as response:
            response.raise_for_status()
            doc = BeautifulSoup(response.text, 'html.parser')

        name = doc.select_one('h1.product-page-header')
        price = doc.select_one('span.price')
        image = doc.select_one('meta[property="og:image"]')

        return cls(
            name=name and name.text.strip(),
            price=price and price.text.strip(),
            image=image and image['content'],
            sizes=cls.get_sizes(doc),
        )


def main():
    product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")

    previous_request = product.sizes

    while True:
        product = Product.from_page("https://shelta.se/sneakers/nike-air-zoom-type-whiteblack-cj2033-103")

        if set(product.sizes) - set(previous_request):
            print("new changes on the webpage")
            previous_request = product.sizes

        else:
            print("No changes made")

        time.sleep(500)


if __name__ == '__main__':
    main()

The problem I am facing is that there is a scenario where the product can be taken down. For example if I now I have found ['US 9,5/EUR 43', 'US 10,5/EUR 44,5'] and the webpage gets taken down by the admin where it returns 404. After few hours they re-add back the webpage and add again the values ['US 9,5/EUR 43', 'US 10,5/EUR 44,5']- That would not print the value we already had it before on our previous valid request.

I wonder what would be the best way to print out the values if a webpage returns from 404 back to 200 (even if they add the same value?)



from How to compare variables if not http 200 status

No comments:

Post a Comment