Friday 28 April 2023

Multi-threaded execution for web scraping with Selenium throwing errors - Python

I have around 30k license numbers that I want to search on a website, extracting all the relevant information for each one. When I tried extracting the information with the function below, looping through multiple license_nums one at a time, the code works fine and gives me what I am looking for.

# create a UserAgent object to generate random user agents
user_agent = UserAgent()

# create a ChromeOptions object to set the user agent in the browser header
chrome_options = Options()
chrome_options.add_argument(f'user-agent={user_agent.random}')
chrome_options.add_argument("start-maximized")

# create a webdriver instance with the ChromeOptions object.
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 and
# removed in 4.10+; pass a Service object instead so this keeps working on
# current Selenium releases.
from selenium.webdriver.chrome.service import Service

service = Service(r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)

# hide the `navigator.webdriver` flag and pin a fixed user agent string —
# both are simple bot-detection evasions.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
print(driver.execute_script("return navigator.userAgent;"))

form_url = "https://cdicloud.insurance.ca.gov/cal/LicenseNumberSearch?handler=Search"
driver.get(form_url)

# licenses to look up; get_license_info() ignores anything not in this list
license_num = ['0726675', '0747600', '0691046', '0D95524', '0E77989', '0L78427']

def get_license_info(license):
    """Search the CDI site for *license* and return a DataFrame of its
    'Accident' license rows (name, number, type, dates).

    Returns an empty DataFrame when the license is not in ``license_num``
    or when no matching rows were found.

    NOTE(review): this function drives the single module-level ``driver``;
    it is NOT thread-safe — concurrent calls type into the same browser,
    which is why the multi-threaded runs below fail. Each thread needs its
    own webdriver instance.
    """
    if license not in license_num:
        return pd.DataFrame()
    df_license = []
    # send_keys() returns None, so there is nothing useful to bind here
    driver.find_element('id', 'SearchLicenseNumber').send_keys(license)
    time.sleep(randint(15, 100))  # randomized delay to look less bot-like
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.ID, "btnSearch"))).click()
    soup = BeautifulSoup(driver.page_source, "html.parser")

    table = soup.find('table', id='searchResult')
    license_name = []
    license_number = []

    # Collect the name/number columns for every result row on the page.
    for result_row in table.tbody.find_all('tr'):
        columns = result_row.find_all('td')
        if columns:
            l_name = columns[0].text.strip().replace("\t", " ")
            license_name.append(l_name)
            license_number.append(columns[1].text.strip())
            print(l_name)

    # Open each result's detail window and scrape the license grid.
    for idx in range(len(license_name)):
        first_page_handle = driver.current_window_handle
        time.sleep(5)

        WebDriverWait(driver, 40).until(EC.element_to_be_clickable(
            (By.XPATH,
             f"//table[@id='searchResult']/tbody/tr[{idx + 1}]/td[2]/a"))).click()
        try:
            driver.switch_to.window(driver.window_handles[1])
            soup = BeautifulSoup(driver.page_source, "lxml")
            # Grab license type and expiration date from the detail grid.
            table_l = soup.find('table', id='licenseDetailGrid')
            data = [[td.text for td in tr.find_all('td')]
                    for tr in table_l.find_all('tr')]
            df1 = pd.DataFrame(data, columns=['license_type', 'original_issue_date',
                                              'status', 'status_date', 'exp_date'])
            time.sleep(5)
            business = soup.find("div", id="collapse-LicenseDetailSection").extract()
            b_list = list(business.stripped_strings)
            df_final = df1[df1['license_type'].str.contains("Accident", na=False)]
            df_final = df_final.assign(
                license_type=df_final['license_type'].str.extract(r'(.*)\n'))
            # BUG FIX: attach THIS row's name; the original used the loop
            # leftover `l_name`, i.e. the last name on the page, for every row.
            df_final['license_name'] = license_name[idx]
            df_final['license_number'] = license
            df_license.append(df_final)
            driver.close()
            driver.switch_to.window(first_page_handle)
        except NoSuchWindowException:
            print("Window closed, skipping to next license")
            # BUG FIX: return focus to the results window so the next
            # iteration doesn't operate on a dead handle.
            driver.switch_to.window(first_page_handle)

    driver.find_element('id', 'SearchLicenseNumber').clear()
    time.sleep(5)

    # BUG FIX: pd.concat([]) raises ValueError — return empty frame instead.
    return pd.concat(df_license) if df_license else pd.DataFrame()

When I try to run it with multiple threads, it doesn't show the value in the search field and throws an error.

approach 1 from (Scraping multiple webpages at once with Selenium)

with futures.ThreadPoolExecutor() as executor:
    # store the future for each license as a dict, so we know which one fails
    future_results = {license: executor.submit(get_license_info, license)
                      for license in license_num}

    frames = []
    for license, future in future_results.items():
        try:
            # BUG FIX: collect each future's own result exactly once. The
            # original re-concatenated EVERY future on every iteration and
            # ignored the loop's `future`, so one failure aborted them all.
            frames.append(future.result())
        except Exception as exc:
            print('License {} raised an exception: {}'.format(license, exc))

    # aggregate whatever succeeded (empty frame if everything failed)
    df_license = pd.concat(frames) if frames else pd.DataFrame()

approach 2 from (How to run `selenium-chromedriver` in multiple threads)

start_time = time.time()
# Spawn one worker thread per license number; each performs its own search.
threads = [threading.Thread(target=get_license_info, args=(lic,))
           for lic in license_num]
for worker in threads:
    worker.start()
# The main thread blocks here until every worker has finished.
for worker in threads:
    worker.join()
print("multiple threads took ", (time.time() - start_time), " seconds")

Can anybody help me with this? Thank you in advance.



from Multi Thread execution for webscrapping with Selenium throwing errors - Python

No comments:

Post a Comment