Friday, 19 April 2019

Unable to make use of links to fetch different titles

I've created a script in node using promise in combination with request and cheerio to parse the links under Province column from this webpage then reuse those links to scrape all the urls under Office column from all of such pages and finally make use these links to collect the title from all of such target pages, as in Cairos main Post Office in this page.

My current script most of the times gets stuck. However, sometimes it throws this error UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined. I've checked each of the functions and found that they are all working in the right way individually.

Although the script looks a bit bigger, it is built upon a very simple logic which is make use of each links from it's landing page until it reaches the title of it's target page.

This is my try so far:

const request = require('request');
const cheerio = require('cheerio');

const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';

const items = [];
const nitems = [];

let getLinks = () => {
    return new Promise((resolve, reject) => {
        request(link, function(error, response, html) {
            let $ = cheerio.load(html);
            if (error) return reject(error);
            try {
                $('.table tbody tr').each(function() {
                    items.push(base_link + $(this).find("a").attr("href"));
                });
                resolve(items);
            } catch (e) {
                reject(e);
            }
        });
    });
};

let getData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    $('.table tbody tr').each(function() {
                        nitems.push(base_link + $(this).find("a").attr("href"));
                    });
                    resolve(nitems);
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

let FetchData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                let $ = cheerio.load(html);
                if (error) return reject(error);
                try {
                    resolve($(".home-title > h2").eq(0).text());
                } catch (e) {
                    reject(e);
                }
            })
        }))

    return Promise.all(promises)
}

getLinks().then(resultList => {
    getData(resultList).then(resultSet => {
        FetchData(resultSet).then(title =>{
            console.log(title);
        })
    })
})

How can I scrape the titles from target pages making use of all the links from landing pages?



from Unable to make use of links to fetch different titles

No comments:

Post a Comment