I have around 30k license numbers that I want to search on a website, extracting all the relevant information for each one. When I extract the information with the function below, looping through multiple license_nums one at a time, the code works fine and gives me what I am looking for.
# create a UserAgent object to generate random user agents
user_agent = UserAgent()
# create a ChromeOptions object to set the user agent in the browser header
chrome_options = Options()
chrome_options.add_argument(f'user-agent={user_agent.random}')
chrome_options.add_argument("start-maximized")
# create a webdriver instance with the ChromeOptions object
# NOTE(review): `executable_path` is deprecated in Selenium 4 (removed in 4.10+);
# prefer webdriver.Chrome(service=Service(path), options=chrome_options) — confirm
# which Selenium version this runs against.
driver = webdriver.Chrome(options=chrome_options,executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
# Hide the `navigator.webdriver` flag from basic bot detection on the page.
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
# NOTE(review): this CDP override pins a FIXED Chrome 83 UA string, replacing the
# random user agent set via chrome_options above — the fake_useragent step is moot.
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
# Sanity check: print the UA the browser will actually report.
print(driver.execute_script("return navigator.userAgent;"))
form_url = "https://cdicloud.insurance.ca.gov/cal/LicenseNumberSearch?handler=Search"
driver.get(form_url)
# License numbers to look up; get_license_info() validates its argument against this list.
license_num = ['0726675', '0747600', '0691046', '0D95524', '0E77989', '0L78427']
def get_license_info(license):
    """Search the CA DOI license site for one license number.

    Types the number into the (module-global) `driver`'s search box, clicks
    search, opens each result row's detail window, and collects the rows of
    the `licenseDetailGrid` table whose license_type contains "Accident".

    Parameters
    ----------
    license : str
        A license number; must be present in the module-level `license_num`
        list or an empty DataFrame is returned.

    Returns
    -------
    pandas.DataFrame
        Columns: license_type, original_issue_date, status, status_date,
        exp_date, license_name, license_number. Empty if nothing matched.

    WARNING(review): this function drives ONE shared global `driver`; it is
    NOT thread-safe. Running it from multiple threads makes the threads
    interleave keystrokes/clicks in the same browser, which is why the
    multithreaded runs fail. Each worker needs its own WebDriver instance
    (e.g. created in a thread-local or passed in as an argument).
    """
    if license not in license_num:
        return pd.DataFrame()

    collected = []

    # send_keys() returns None — don't bind the result to a variable.
    driver.find_element('id','SearchLicenseNumber').send_keys(license)
    # Randomized delay to look less bot-like (15–100 s — very slow for 30k lookups).
    time.sleep(randint(15, 100))
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.ID, "btnSearch"))
    ).click()

    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find('table', id='searchResult')
    # Guard: no results table (or empty body) would otherwise raise AttributeError.
    if table is None or table.tbody is None:
        driver.find_element('id','SearchLicenseNumber').clear()
        return pd.DataFrame()

    # First pass: collect (name, number) for every result row on the page.
    license_name = []
    license_number = []
    for result_row in table.tbody.find_all('tr'):
        columns = result_row.find_all('td')
        if columns:
            l_name = columns[0].text.strip().replace("\t", " ")
            license_name.append(l_name)
            license_number.append(columns[1].text.strip())
            print(l_name)

    # Second pass: open each row's detail window and scrape the detail grid.
    # NOTE: use `idx`, not `row` — the original reused `row` for the inner
    # td-list and shadowed the loop variable.
    for idx in range(len(license_name)):
        first_page_handle = driver.current_window_handle
        time.sleep(5)
        WebDriverWait(driver, 40).until(EC.element_to_be_clickable(
            (By.XPATH, f"//table[@id='searchResult']/tbody/tr[{idx+1}]/td[2]/a")
        )).click()
        try:
            driver.switch_to.window(driver.window_handles[1])
            detail_soup = BeautifulSoup(driver.page_source, "lxml")
            # Grab license type and expiration date from the detail grid.
            table_l = detail_soup.find('table', id='licenseDetailGrid')
            data = [[td.text for td in tr.find_all('td')]
                    for tr in table_l.find_all('tr')]
            df1 = pd.DataFrame(data, columns=['license_type','original_issue_date','status','status_date','exp_date'])
            time.sleep(5)
            df_final = df1[df1['license_type'].str.contains("Accident",na=False)]
            # Raw string for the regex so '\n' is handled by re, not Python escapes.
            df_final = df_final.assign(license_type=df_final['license_type'].str.extract(r'(.*)\n'))
            # BUG FIX: attach the name belonging to THIS row; the original used
            # `l_name`, which was whatever the FIRST loop last assigned.
            df_final['license_name'] = license_name[idx]
            df_final['license_number'] = license
            collected.append(df_final)
            driver.close()
            driver.switch_to.window(first_page_handle)
        except NoSuchWindowException:
            print("Window closed, skipping to next license")

    # Reset the search box once per call, ready for the next license.
    driver.find_element('id','SearchLicenseNumber').clear()
    time.sleep(5)
    # pd.concat([]) raises ValueError — return an empty frame instead.
    if not collected:
        return pd.DataFrame()
    return pd.concat(collected)
However, when I try to run it with multiple threads, the value never appears in the search field and the code throws errors.
approach 1 from (Scraping multiple webpages at once with Selenium)
# Fan the lookups out over a thread pool, keyed by license so a failure can be
# attributed to the number that caused it.
with futures.ThreadPoolExecutor() as executor:
    # store the future for each license as a dict, so we can know which one fails
    future_results = {license: executor.submit(get_license_info, license) for license in license_num}
    # BUG FIX: the original called pd.concat over ALL futures' results inside
    # every loop iteration — O(n^2) work, and one failing future poisoned the
    # whole concat. Resolve each future once, then concat a single time.
    frames = []
    for license, future in future_results.items():
        try:
            frames.append(future.result())
        except Exception as exc:
            print('An exception occurred for {}: {}'.format(license, exc))
    df_license = pd.concat(frames) if frames else pd.DataFrame()
approach 2 from (How to run `selenium-chromedriver` in multiple threads)
start_time = time.time()
# BUG FIX: the original thread targets discarded get_license_info's returned
# DataFrame, so the multithreaded run could never produce output. Collect the
# results through a wrapper into a shared list (list.append is atomic under
# the GIL, so no lock is needed).
# NOTE(review): get_license_info drives one shared global WebDriver, so these
# threads still interleave in the same browser — each thread really needs its
# own driver instance for this to work.
results = []

def _collect(license):
    # One-line purpose: run one lookup and keep its DataFrame.
    results.append(get_license_info(license))

threads = []
for license in license_num:  # each thread could be like a new 'click'
    th = threading.Thread(target=_collect, args=(license,))
    th.start()  # could `time.sleep` between 'clicks' to see what's up without headless option
    threads.append(th)
for th in threads:
    th.join()  # Main thread waits for workers to finish
print("multiple threads took ", (time.time() - start_time), " seconds")
Can anybody help me with this? Thank you in advance.
from Multi Thread execution for webscrapping with Selenium throwing errors - Python
No comments:
Post a Comment