Thursday, 15 July 2021

Python execute script using multiple browsers Selenium

How can I execute the below script using multiple browsers?

Each batch of n URLs should be processed in its own browser, and I should be able to choose the value of n (parallel scraping).

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver

# One Chrome session, created as soon as this module is loaded; parse_data()
# drives this same instance for every URL it is given.
browser = webdriver.Chrome()

class GameData:
    """Column-wise accumulator for scraped match rows.

    Every attribute is an independent list; entries at the same index across
    the lists describe one match. Attribute creation order is significant
    because the instance ``__dict__`` is turned directly into a DataFrame.
    """

    def __init__(self):
        # Created in this exact order so ``__dict__`` yields the columns in
        # the expected sequence.
        column_names = (
            'date',
            'time',
            'game',
            'score',
            'home_odds',
            'draw_odds',
            'away_odds',
            'country',
            'league',
        )
        for column in column_names:
            # A fresh list per column; the lists must never be shared.
            setattr(self, column, [])

def parse_data(url, max_attempts=3):
    """Scrape one results page and return its matches as a ``GameData``.

    The page is loaded in the module-level ``browser``. The first HTML table
    on the page is read with pandas for the row data, and the same page
    snapshot is parsed with BeautifulSoup to pick the country and league
    names out of the table's header cell.

    Args:
        url: Address of the results page to load.
        max_attempts: How many times to load and parse the page before
            giving up. Must be at least 1.

    Returns:
        A populated ``GameData``, or ``None`` when the expected page
        structure (wrapper divs, results table, header cell or its links)
        is not present.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1, or if pandas
            still finds no table after the last attempt.
        KeyError: If table parsing still fails with ``KeyError`` after the
            last attempt.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            browser.get(url)
            # Read the page once so the DataFrame and the soup below are
            # built from the same snapshot.
            html = browser.page_source
            df = pd.read_html(html)[0]
        except (KeyError, ValueError):
            # The shared browser is deliberately NOT quit here: a closed
            # session cannot serve the retry. pandas raises ValueError when
            # the page contains no table at all.
            if attempt == max_attempts:
                raise
        else:
            break

    soup = bs(html, "lxml")
    wrapper = soup.find('div', {'id': 'wrap'})
    if wrapper is None:
        return None
    column = wrapper.find('div', {'id': 'col-content'})
    if column is None:
        return None
    # Both filters belong in one attrs dict; a third positional argument to
    # find() is ``recursive``, not a second attribute filter.
    table = column.find('table', {'class': 'table-main', 'id': 'tournamentTable'})
    if table is None:
        return None
    header = table.find('th', {'class': 'first2 tl'})
    if header is None:
        return None
    links = header.find_all('a')
    if len(links) < 3:
        # The country and league are taken from the 2nd and 3rd links.
        return None
    country = links[1].text
    league = links[2].text

    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # row[0] is the DataFrame index; row[1] is the first table column.
        first_cell = row[1]
        if not isinstance(first_cell, str):
            continue
        if ':' not in first_cell:
            # Presumably a date heading row: keep the text before the first
            # '-' as the date for the match rows that follow.
            game_date = first_cell.split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(first_cell)
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data

# URLs go here.
# Kept as a list (not a set) so pages are visited in the order written and the
# collection can be sliced into fixed-size batches.
urls = [
    "https://www.oddsportal.com/soccer/world/international-champions-cup/results/#/",
    "https://www.oddsportal.com/soccer/romania/superliga-women/results/#/",
    "https://www.oddsportal.com/soccer/portugal/league-cup/results/#/",
    "https://www.oddsportal.com/soccer/world/valentin-granatkin-memorial/results/#/",
    "https://www.oddsportal.com/soccer/slovenia/prva-liga/results/#/",
    "https://www.oddsportal.com/soccer/brazil/campeonato-pernambucano/results/#/",
    "https://www.oddsportal.com/soccer/netherlands/eredivisie-cup-women/results/#/",
    "https://www.oddsportal.com/soccer/singapore/premier-league/results/#/",
    "https://www.oddsportal.com/soccer/world/world-cup-women-u20/results/#/",
    "https://www.oddsportal.com/soccer/world/premier-league-asia-trophy/results/#/",
]

if __name__ == '__main__':
    # Scrape every URL with the shared browser and print the combined table.
    frames = []
    try:
        for url in urls:
            try:
                game_data = parse_data(url)
            except Exception as exc:
                # Best effort at the top level: report the failing page and
                # keep going so one bad URL does not discard the rest.
                print("Skipping {}: {!r}".format(url, exc))
                continue
            if game_data is None:
                continue
            frames.append(pd.DataFrame(game_data.__dict__))
    finally:
        # Always release the browser, even if scraping is interrupted.
        browser.quit()

    # DataFrame.append no longer exists in pandas 2.x; concatenate once at
    # the end. concat() rejects an empty list, so keep None for "no data".
    results = pd.concat(frames, ignore_index=True) if frames else None
    print(results)

Currently the script uses one browser window for all URLs.

How can I modify the code to open a separate browser instance for every n URLs, so that the same job runs faster, and then append everything into results?



from Python execute script using multiple browsers Selenium

No comments:

Post a Comment