Thursday, 11 October 2018

Can't scrape and print the links on the fly

I've written a script in Node.js to scrape the links of different titles from a webpage. When I execute the following script, I get undefined printed in the console instead of the links I'm after. The selectors I've defined are accurate.

I do not wish to put the links in an array and return the results; rather, I wish to print them on the fly. As I'm very new to writing scripts using Node.js in combination with Puppeteer, I can't figure out the mistake I'm making.

This is my script (Link to that site):

const puppeteer = require('puppeteer');
/**
 * Launches a browser, opens the Stack Overflow "web-scraping" tag page and
 * logs the href of every question link from inside the page.
 *
 * The callback given to page.evaluate() runs in the page, not in Node, so the
 * console.log call below writes to the page's console rather than to this
 * process's stdout. The callback also returns nothing, so the promise resolves
 * with undefined. Forwarding the page's console output to Node would need a
 * page.on('console', ...) listener registered before evaluate() is called.
 *
 * @returns {Promise<undefined>} Settles once the browser has been closed;
 *     rejects with whatever error launch, navigation or evaluation raised.
 */
async function run () {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
        const url = await page.evaluate(() => {
            const items = document.querySelectorAll('a.question-hyperlink');
            items.forEach((item) => {
                //would like to keep the following line intact 
                console.log(item.getAttribute('href'));
            });
        });
        return url;
    } finally {
        // Close the browser on every path so that a failed navigation or
        // evaluation cannot leave the browser process running.
        await browser.close();
    }
}
// Start the scrape: print the resolved value on success, or the error on failure.
run().then(console.log).catch(console.error);

The following script works just fine if I declare an empty array named results, store the scraped links in it, and finally return that array.

const puppeteer = require('puppeteer');
/**
 * Launches a browser, opens the Stack Overflow "web-scraping" tag page and
 * collects the href of every question link.
 *
 * The links are gathered inside the page by the page.evaluate() callback and
 * handed back to Node as that callback's return value.
 *
 * @returns {Promise<Array<{url: string}>>} One `{ url }` object per matched
 *     `a.question-hyperlink` element, in document order; rejects with whatever
 *     error launch, navigation or evaluation raised.
 */
async function run () {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
        const urls = await page.evaluate(() => {
            const items = document.querySelectorAll('a.question-hyperlink');
            // Map the NodeList straight to plain objects so the result can be
            // returned to Node.
            return Array.from(items, (item) => ({
                url: item.getAttribute('href'),
            }));
        });
        return urls;
    } finally {
        // Close the browser on every path so that a failed navigation or
        // evaluation cannot leave the browser process running.
        await browser.close();
    }
}
// Start the scrape: print the collected links on success, or the error on failure.
run().then(console.log).catch(console.error);

Once again: my question is, how can I print each link on the fly with console.log(item.getAttribute('href')); without storing the links in an array?



from Can't scrape and print the links on the fly

No comments:

Post a Comment