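First, I have to go to the search page (https://ipindiaservices.gov.in/PublicSearch/, the URL the code below opens).
There I enter the applicant name, e.g. “ASIAN PAINTS”.
I do this with the following code, run in Google Colab: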
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
# Headless Chrome for a container environment such as Google Colab.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# NOTE(review): the next two flags are the usual workaround for running Chrome
# as root / with a small /dev/shm inside a container — confirm they are still needed.
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Turn on the "performance" log; get_captcha_text() reads network events from it.
# Setting the capability on the options object avoids mutating the shared
# DesiredCapabilities.CHROME dictionary.
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
# Everything is passed through `options`: the positional driver path and the
# `chrome_options=` / `desired_capabilities=` keywords were deprecated in
# Selenium 4 and later removed, so the old call fails on a fresh
# `pip install selenium`.
driver = webdriver.Chrome(options=options)
import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
def save_to_csv(data: list) -> None:
    """Append one row to ``ipindiaservices.csv`` in the current directory.

    Arguments:
    - data: the cell values of the row, in column order
    """
    # newline='' hands line-ending control to the csv module (as the csv docs
    # require), so the '\n' terminator is written as-is on every platform and
    # newlines embedded in quoted fields are not rewritten.
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(data)
def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Write ``page_number`` into the ``button.next`` element and click it.

    Both steps run as JavaScript in the page currently loaded in ``driver``.
    Presumably the site's pager reads the button's ``value`` to decide which
    page to load — confirm against the live page.

    Arguments:
    - page_number: value assigned to the button before the click
    - driver: WebDriver whose current page contains the pager
    """
    jump_script = (
        "\n"
        f"document.querySelector('button.next').value = {page_number};\n"
        "document.querySelector('button.next').click();\n"
    )
    driver.execute_script(jump_script)
def titles_validation(driver: WebDriver) -> None:
    """Put '_' into every blank value cell of the details table.

    The cells are the last ``<td>`` of each non-first row matched by
    ``input+.tab-pane``; a cell counts as blank when its trimmed text is empty.
    The replacement happens in the page itself via JavaScript.
    """
    fill_blanks_script = (
        "\n"
        "let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')\n"
        "Array.from(titles).forEach((e) => {\n"
        "if (!e.textContent.trim()) {\n"
        "e.textContent = '_';\n"
        "}\n"
        "});\n"
    )
    driver.execute_script(fill_blanks_script)
def get_network_data(log: dict, driver: "WebDriver") -> "dict | None":
    """Return the response body for a captcha-audio network event, else None.

    Arguments:
    - log: one entry from ``driver.get_log('performance')``; its "message"
      field is a JSON string whose own "message" key holds the event
    - driver: WebDriver used to fetch the body through the
      ``Network.getResponseBody`` CDP command

    Returns the result of that CDP command when the event's method contains
    "Network.responseReceived" and its params mention 'CaptchaAudio';
    returns None for every other entry.
    """
    event = json.loads(log["message"])["message"]
    # Check the conditions one at a time: building them all in a list up front
    # evaluated event["params"] even for events that have no params (KeyError).
    if "Network.responseReceived" not in event.get("method", ""):
        return None
    params = event.get("params")
    if not params or 'CaptchaAudio' not in str(params.values()):
        return None
    return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': params["requestId"]})
def get_captcha_text(driver: WebDriver, timeout: float) -> str:
    """Return the captcha text read from the browser's network log.

    Clicks the "Captcha Audio" image, waits, then scans the performance log
    for the response that get_network_data() recognises and returns the
    'CaptchaImageText' field of its JSON body. If no such response has been
    logged yet, the click/wait/scan cycle is repeated until one appears.

    Arguments:
    - driver: WebDriver
    - timeout: pause (seconds) before receiving data from the web driver log
    """
    click_audio_script = (
        "\n"
        "// document.querySelector('img[title=\"Captcha\"]').click()\n"
        "document.querySelector('img[title=\"Captcha Audio\"]').click()\n"
    )
    # Retry in a loop rather than by recursion: the recursive retry dropped its
    # result (no `return`, so the caller got None) and could exhaust the stack.
    while True:
        driver.execute_script(click_audio_script)
        sleep(timeout)
        for entry in driver.get_log('performance'):
            # One call per entry; a matching entry triggers a CDP request, so
            # the result is reused instead of being fetched a second time.
            response = get_network_data(entry, driver)
            if response:
                return json.loads(response['body'])['CaptchaImageText']
def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Type the captcha text into the form and click the send button.

    Uses the module-level ``wait`` (WebDriverWait), which must be defined
    before this function is called.

    Arguments:
    - captcha_text: text typed into the element with id 'CaptchaText'
    - btn_name: captcha send button name["submit" or "search"];
      "search" clicks ``input[name="submit"]``, "submit" clicks ``#btnSubmit``

    Raises:
    - ValueError: if btn_name is neither "search" nor "submit"
    """
    if btn_name == 'search':
        captcha_locator = (By.CSS_SELECTOR, 'input[name="submit"]')
    elif btn_name == 'submit':
        captcha_locator = (By.ID, 'btnSubmit')
    else:
        # Previously an unknown name left captcha_locator unbound and failed
        # later with an unrelated UnboundLocalError, after the text was typed.
        raise ValueError(f'btn_name must be "search" or "submit", got {btn_name!r}')
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(captcha_locator)).click()
# Disabled alternative driver setup, kept for reference only (never executed):
#
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
# capabilities = DesiredCapabilities.CHROME
# capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# # service = Service(executable_path="path/to/your/chromedriver.exe")
# # driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
# driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)
# Shared explicit wait (15 s); submit_captcha() reads this module-level name.
wait = WebDriverWait(driver, 15)

# Locators for the elements the script interacts with.
table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

# The whole run is wrapped in try/finally so the browser is closed even when a
# wait times out or the page layout changes part-way through.
try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')
    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
    # on the start page and the page with the table, the names of the buttons are different
    captcha_text = get_captcha_text(driver, 1)
    submit_captcha(captcha_text, "search")
    # the page where the search starts
    start_from_page(1, driver)
    while True:
        start = time()
        # get current page number
        current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
        print(f"Current page: {current_page}")
        # get all application status WebElements (only their count is used here)
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
        for index in range(len(app_status_elements)):
            print(f"App number: {index}")
            # The list is looked up again on every pass and addressed by
            # index, because each pass leaves the page and returns via back().
            app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
            # click on application status
            wait.until(EC.visibility_of(app_status_elements[index])).click()
            # wait 2 seconds for the captcha to change
            sleep(2)
            # get text and submit captcha
            captcha_text = get_captcha_text(driver, 1)
            submit_captcha(captcha_text, "submit")
            try:
                # get all table data values(without titles) WebElements
                table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
                # if there are empty rows in the table replace them with "_"
                titles_validation(driver)
                # save data to csv
                save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
            except TimeoutException:
                print("Application Number does not exist")
            finally:
                # return to the results list whether or not the details loaded
                driver.back()
        # report how long this results page took
        print(f"Time per page: {round(time()-start, 3)}")
        # if the current page is equal to the specified one, then stop the search
        if current_page == '1':
            break
        # click next page
        wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
finally:
    driver.quit()
import pandas as pd

# Column names for the scraped rows. NOTE(review): assumed to match the order
# of the value cells written by the scraper — confirm against a sample row.
column_names = [
    'APPLICATION NUMBER',
    'APPLICATION TYPE',
    'DATE OF FILING',
    'APPLICANT NAME',
    'TITLE OF INVENTION',
    'FIELD OF INVENTION',
    'E-MAIL (As Per Record)',
    'ADDITIONAL-EMAIL (As Per Record)',
    'E-MAIL (UPDATED Online)',
    'PCT INTERNATIONAL APPLICATION NUMBER',
    'PCT INTERNATIONAL FILING DATE',
    'PRIORITY DATE',
    'REQUEST FOR EXAMINATION DATE',
    'PUBLICATION DATE (U/S 11A)',
]
# The scraper writes data rows only, so header=None is needed; without it
# pandas consumed the first scraped record as the header and it was lost.
data = pd.read_csv('/content/ipindiaservices.csv', header=None)
# set_axis returns a new frame; the `inplace` keyword was removed in pandas 2.0.
df = data.set_axis(column_names, axis=1)
df.head(2)
from google.colab import drive
# Mount at an absolute path and write below that same path; the previous code
# mounted at 'drive' but wrote to '/drive/...', which is a different location.
drive.mount('/content/drive')
df.to_csv('data.csv')
df.to_csv('/content/drive/My Drive/folder_name/name_csv_file.csv')
With this code I am able to extract this information successfully.
I also need to extract the information from another table on the same page (the one marked in yellow). Is that possible?
I want to append this status to my existing CSV. Can it be done by modifying the existing code? Thanks in advance.
from How to fetch two table's information from a same webpage?


No comments:
Post a Comment