Saturday, 19 November 2022

How to fetch two tables' information from the same webpage?

I have to go to this page: https://ipindiaservices.gov.in/PublicSearch/ (the URL the script below opens).

There I have to enter an applicant name, for example “ASIAN PAINTS”.

I do this with the following code (run in Google Colab):

# Colab shell commands: install the Selenium Python package, refresh the apt
# package index, then install the chromium-chromedriver system package.
!pip install selenium
!apt-get update
!apt install chromium-chromedriver

import re
import csv
import json
from time import sleep
from typing import Generator, List, Tuple
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC

# Headless Chrome with the sandbox and /dev/shm usage switched off.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Turn on the "performance" log; get_captcha_text() reads network events from it.
# The capability is set on the options object so that the shared
# DesiredCapabilities.CHROME dict is not mutated and no `chrome_options=` /
# `desired_capabilities=` keyword (rejected by newer Selenium releases) is needed.
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
# No explicit driver path: Selenium locates `chromedriver` itself.
driver = webdriver.Chrome(options=options)

import csv
import json
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def save_to_csv(data: list) -> None:
    """Append one row to ``ipindiaservices.csv`` in the current directory.

    Arguments:
        - data: cell values of the row, written in the given order
    """
    # newline='' leaves line endings entirely to the csv writer, as the csv
    # module documentation requires for file objects handed to csv.writer.
    with open(file='ipindiaservices.csv', mode='a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(data)

def start_from_page(page_number: int, driver: WebDriver) -> None:
    """Run JavaScript that sets and clicks the ``button.next`` element.

    The script assigns ``page_number`` to the button's ``value`` and then
    clicks the same button.

    Arguments:
        - page_number: value written into the button before it is clicked
        - driver: WebDriver
    """
    jump_script = f"""
    document.querySelector('button.next').value = {page_number}; 
    document.querySelector('button.next').click();
    """
    driver.execute_script(jump_script)

def titles_validation(driver: WebDriver) -> None:
    """Fill blank table cells with '_'.

    Runs JavaScript in the page that visits every last-column cell of the
    non-header rows in the ``input+.tab-pane`` table and sets its text to
    '_' when the trimmed text is empty.
    """
    fill_blank_cells = """
        let titles = document.querySelectorAll('input+.tab-pane tr:not(:first-child)>td:last-child')
        Array.from(titles).forEach((e) => {
            if (!e.textContent.trim()) {
                e.textContent = '_';
            }
        });
        """
    driver.execute_script(fill_blank_cells)

def get_network_data(log: dict, driver: WebDriver) -> "dict | None":
    """Return the response body of a captcha-audio network event, if ``log`` is one.

    Arguments:
        - log: one entry of the driver's "performance" log; its "message"
          value is a JSON string
        - driver: WebDriver used to request the response body through CDP

    Returns the result of the ``Network.getResponseBody`` CDP command when the
    entry is a ``Network.responseReceived`` event whose params mention
    'CaptchaAudio'; otherwise ``None``.
    """
    event = json.loads(log["message"])["message"]
    if "Network.responseReceived" not in event["method"]:
        return None
    # The conditions are checked one at a time: evaluating them all up front
    # would index event["params"] even when that key is missing.
    params = event.get("params")
    if params is None or 'CaptchaAudio' not in str(params.values()):
        return None
    return driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': params["requestId"]})

def get_captcha_text(driver: WebDriver, timeout: float) -> str:
    """Return captcha text

    Clicks the captcha audio control, pauses, then scans the driver's
    "performance" log for the captcha response and reads 'CaptchaImageText'
    from its JSON body. The click/pause/scan sequence repeats until such a
    response is found, so this call does not return while none appears.

    Arguments:
        - driver: WebDriver
        - timeout: pause before receiving data from the web driver log
    """
    click_audio_script = """
        // document.querySelector('img[title="Captcha"]').click()
        document.querySelector('img[title="Captcha  Audio"]').click()
        """
    # Retry in a loop rather than by recursion, so the text found on a later
    # attempt is actually returned to the caller.
    while True:
        driver.execute_script(click_audio_script)
        sleep(timeout)
        for entry in driver.get_log('performance'):
            # one lookup per entry: each match triggers a CDP round trip
            response = get_network_data(entry, driver)
            if response:
                return json.loads(response['body'])['CaptchaImageText']

def submit_captcha(captcha_text: str, btn_name: str) -> None:
    """Submit captcha

    Types the captcha text into the ``CaptchaText`` field and clicks the
    matching send button, using the module-level ``wait`` object.

    Arguments:
        - captcha_text: text typed into the captcha field
        - btn_name: captcha send button name["submit" or "search"]

    Raises:
        - ValueError: if btn_name is neither "search" nor "submit"
    """
    button_locators = {
        'search': (By.CSS_SELECTOR, 'input[name="submit"]'),
        'submit': (By.ID, 'btnSubmit'),
    }
    if btn_name not in button_locators:
        # reject a bad name before touching the page, instead of failing on an
        # unbound local after the captcha text has already been typed
        raise ValueError(f'btn_name must be "search" or "submit", got {btn_name!r}')
    captcha_locator = button_locators[btn_name]
    wait.until(EC.visibility_of_element_located((By.ID, 'CaptchaText'))).send_keys(captcha_text)
    wait.until(EC.visibility_of_element_located(captcha_locator)).click()

# NOTE: the triple-quoted block below is a bare string literal that is never
# assigned or used, so none of the driver setup code inside it is executed.
'''
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
capabilities = DesiredCapabilities.CHROME
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# service = Service(executable_path="path/to/your/chromedriver.exe")
# driver = webdriver.Chrome(service=service, options=options, desired_capabilities=capabilities)
driver = webdriver.Chrome('chromedriver', chrome_options=options, desired_capabilities=capabilities)

'''

# Shared explicit wait (15 s) used by submit_captcha() and the loop below.
wait = WebDriverWait(driver, 15)

table_values_locator = (By.CSS_SELECTOR, 'input+.tab-pane tr:not(:first-child)>td:last-child')
applicant_name_locator = (By.ID, 'TextField6')
page_number_locator = (By.CSS_SELECTOR, 'span.Selected')
app_satus_locator = (By.CSS_SELECTOR, 'button.btn')
next_btn_locator = (By.CSS_SELECTOR, 'button.next')

try:
    driver.get('https://ipindiaservices.gov.in/PublicSearch/')

    # sometimes an alert with an error message("") may appear, so a small pause is used
    sleep(1)
    wait.until(EC.visibility_of_element_located(applicant_name_locator)).send_keys('ltd')
    # on the start page and the page with the table, the names of the buttons are different
    captcha_text = get_captcha_text(driver, 1)
    submit_captcha(captcha_text, "search")
    # the page where the search starts
    start_from_page(1, driver)

    while True:
        start = time()
        # get current page number
        current_page = wait.until(EC.visibility_of_element_located(page_number_locator)).text
        print(f"Current page: {current_page}")
        # get all application status WebElements
        app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))

        # iterate by index: the elements are looked up again on every pass
        # because the browser navigates away and back for each application
        for element in range(len(app_status_elements)):
            print(f"App number: {element}")
            # update application status WebElements
            app_status_elements = wait.until(EC.visibility_of_all_elements_located(app_satus_locator))
            # click on application status
            wait.until(EC.visibility_of(app_status_elements[element])).click()
            # wait 2 seconds for the captcha to change
            sleep(2)
            # get text and submit captcha
            captcha_text = get_captcha_text(driver, 1)
            submit_captcha(captcha_text, "submit")
            try:
                # get all table data values(without titles) WebElements
                table_data_values = wait.until(EC.visibility_of_all_elements_located(table_values_locator))
                # if there are empty rows in the table replace them with "_"
                titles_validation(driver)
                # save data to csv
                save_to_csv([val.text.replace('\n', ' ') for val in table_data_values])
            except TimeoutException:
                print("Application Number does not exist")
            finally:
                # return to the result list whether or not the details loaded
                driver.back()
        # print how long this page took
        print(f"Time per page: {round(time()-start, 3)}")
        # stop once the page just processed is the last one wanted (page '1' here)
        if current_page == '1':
            break
        # click next page
        wait.until(EC.visibility_of_element_located(next_btn_locator)).click()
finally:
    # always close the browser, even when a wait times out or a lookup fails
    driver.quit()

import pandas as pd

# save_to_csv() writes data rows only, so the file has no header line; without
# header=None pandas would consume the first record as the column names.
data = pd.read_csv('/content/ipindiaservices.csv', header=None)

# Label the columns. `inplace` is not passed: set_axis returns a new frame by
# default, and the keyword was removed in pandas 2.0.
df = data.set_axis(
    [
        'APPLICATION NUMBER',
        'APPLICATION TYPE',
        'DATE OF FILING',
        'APPLICANT NAME',
        'TITLE OF INVENTION',
        'FIELD OF INVENTION',
        'E-MAIL (As Per Record)',
        'ADDITIONAL-EMAIL (As Per Record)',
        'E-MAIL (UPDATED Online)',
        'PCT INTERNATIONAL APPLICATION NUMBER',
        'PCT INTERNATIONAL FILING DATE',
        'PRIORITY DATE',
        'REQUEST FOR EXAMINATION DATE',
        'PUBLICATION DATE (U/S 11A)',
    ],
    axis=1,
)

df.head(2)

from google.colab import drive

# The mount point and the export path must agree: Drive is mounted at
# /content/drive, so the Drive copy is written below that directory.
drive.mount('/content/drive')

# local copy in the current working directory
df.to_csv('data.csv')
# copy inside Drive; NOTE(review): `folder_name` is a placeholder and
# presumably has to exist in Drive already - confirm before running
df.to_csv('/content/drive/My Drive/folder_name/name_csv_file.csv')

With this code I am able to extract the following information successfully:

enter image description here

I also need to extract the information from this table (marked in yellow). Is that possible?

I want to append this status to my previous CSV. Can it be done by modifying the existing code? TIA

enter image description here



from How to fetch two tables' information from the same webpage?

No comments:

Post a Comment