Wednesday, 13 October 2021

Failed to parse content from a webpage using requests

I'm trying to create a script using the requests module (without using a session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a session, I could fetch the content from that site flawlessly.

Here are the manual steps to reach the content:

  1. Choose the first item from the dropdown.
  2. Get the links to the detail pages.
  3. Grab the two fields (Código del Expediente and Descripción del Expediente) from each detail page.

While creating the script using plain requests, I tried to make use of cookies, but I ended up getting an AttributeError.

Script without session:

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scheme and host of the target site; relative hrefs scraped from its pages
# are resolved against this value.
base = 'https://compranet.hacienda.gob.mx'

# Entry page that is fetched first in order to read the dropdown menu.
link = f'{base}/web/login.html'

# Template for a detail-page URL; the `{}` placeholder is filled in with the
# numeric id taken from a row's `goToDetail('<id>')` handler.
vigen_detail_page = base + '/esop/toolkit/opportunity/current/{}/detail.si'

# Headers passed along with every request this script makes.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    """Return the absolute URL of the "Vigentes" entry in the dropdown menu.

    Downloads `link` with the module-level `headers`, looks for the first
    anchor matching `ul.dropdown-menu > li > a` whose text contains
    "Vigentes", and resolves its `href` against `base`.

    Args:
        link: URL of the page that holds the dropdown menu.

    Returns:
        The category URL as an absolute string.

    Raises:
        AttributeError: if no matching anchor exists in the page, because
            `select_one` then returns None.
    """
    response = requests.get(link, headers=headers)
    page = BeautifulSoup(response.text, "html.parser")
    anchor = page.select_one('ul.dropdown-menu > li > a:contains("Vigentes")')
    href = anchor.get("href")
    return urljoin(base, href)

def fetch_detail_page_link(cat_link):
    """Yield `(cookie_header, detail_url)` for each listed row with a detail link.

    Fetches the category listing at `cat_link`, captures the `JSESSIONID`
    cookie handed out for that request, and builds one detail-page URL per
    table row from the id embedded in the row's `goToDetail('<id>')`
    `onclick` handler.

    Args:
        cat_link: absolute URL of the category listing page.

    Yields:
        Tuples `(str_cookie, inner_link)` where `str_cookie` is a ready-made
        `Cookie` header value (`"JSESSIONID=<value>"`) and `inner_link` is
        the detail-page URL built from `vigen_detail_page`.

    Raises:
        KeyError: if no `JSESSIONID` cookie was received, neither on the
            final response nor on any redirect that led to it.
    """
    res = requests.get(cat_link, headers=headers)

    # `res.cookies` only carries the cookies sent back by the final response.
    # When the request was redirected, the session cookie may have been set
    # by an intermediate response instead; those are kept in `res.history`.
    session_id = None
    for response in (res, *reversed(res.history)):
        session_id = response.cookies.get('JSESSIONID')
        if session_id is not None:
            break
    if session_id is None:
        raise KeyError(f"no JSESSIONID cookie received for {cat_link}")
    str_cookie = f"JSESSIONID={session_id}"

    soup = BeautifulSoup(res.text, "html.parser")
    for row in soup.select("table.list-table > tbody.list-tbody > tr"):
        anchor = row.select_one("a.detailLink")
        if anchor is None:
            # Row without a detail anchor: there is nothing to follow.
            continue
        # `onclick` may be absent; fall back to "" so the regex simply fails
        # to match instead of raising TypeError.
        match = re.search(r"goToDetail\(\'(\d+?)\'", anchor.get("onclick") or "")
        if match is None:
            continue
        yield str_cookie, vigen_detail_page.format(match.group(1))

def get_content(str_cookie,inner_link):
    """Fetch one detail page and return its two expediente fields.

    Args:
        str_cookie: value for the `Cookie` request header, e.g.
            `"JSESSIONID=<value>"`.
        inner_link: absolute URL of the detail page.

    Returns:
        A tuple `(expediente, descripcion)` with the stripped text of the
        `.form_answer` element that directly follows the matching
        `.form_question` label. A field that is not found in the page is
        returned as an empty string.
    """
    # Build per-request headers rather than writing the cookie into the
    # shared module-level `headers` dict, which would otherwise leak it into
    # every later request made through that dict.
    request_headers = {**headers, 'Cookie': str_cookie}
    res = requests.get(inner_link, headers=request_headers)
    soup = BeautifulSoup(res.text, "html.parser")

    def answer_text(selector):
        """Return the stripped text of the first match, or "" when absent."""
        node = soup.select_one(selector)
        return "" if node is None else node.get_text(strip=True)

    # NOTE(review): Soup Sieve documents `:contains` as deprecated in favour
    # of `:-soup-contains`; left unchanged here - confirm the installed
    # version before switching.
    expediente = answer_text(".form_question:contains('Código del Expediente') + .form_answer")
    descripcion = answer_text(".form_question:contains('Descripción del Expediente') + .form_answer")
    return expediente,descripcion

if __name__ == '__main__':
    # Resolve the category listing from the dropdown, then print the pair of
    # fields scraped from every detail page that the listing links to.
    listing_url = grab_first_link_from_dropdown(link)
    for session_cookie, detail_url in fetch_detail_page_link(listing_url):
        fields = get_content(session_cookie, detail_url)
        print(fields)

What should I change to make the script work?



from Failed to parse content from a webpage using requests

No comments:

Post a Comment