0.0.7 • Published 7 years ago
xvi-phantom-scraper v0.0.7
xvi-phantom-scraper
A scraper built on the phantom package, with cheerio included for easy parsing of the retrieved pages.
Example
//Add package
const Scraper = require('xvi-phantom-scraper');
// Create the scraper instance. `sources` is an array of sources; each of them
// is crawled sequentially when scraper.once() is called.
const scraper = new Scraper({
  sources: [
    {
      name: 'wikipedia-fruits', // name of this source
      url: 'https://simple.wikipedia.org/wiki/List_of_fruits', // URL of this source
      // Duration in ms to wait before retrieving the content of the page. This
      // can be useful to wait for JS execution. `false` here presumably means
      // "do not wait" -- confirm against the package documentation.
      waitBeforeHandler: false,
      /**
       * Handler applied to the content of the page.
       *
       * @param content - content of the page (log it to inspect it)
       * @param $ - selector function used to query the page; presumably a
       *   cheerio instance loaded with `content`, since the package bundles cheerio
       * @param spider - exposes the phantom objects as properties such as
       *   spider.instance and spider.page, and the source information as
       *   spider.opts. It also has two methods: spider.exit(), which closes
       *   the page and destroys the spider (important), and
       *   spider.screenshot(opts), which takes a screenshot of the current page.
       */
      async handler(content, $, spider) {
        try {
          // Print the text and target of every link in the navbox list.
          $('.mw-body-content td.navbox-list a').each(function () {
            const text = $(this).text();
            const link = $(this).attr('href');
            console.log(`- Found ${text} <${link}>`);
          });
        } finally {
          // Always release the spider, even when the parsing above throws;
          // otherwise the page would never be closed.
          spider.exit();
        }
      },
    },
  ],
});
/**
 * Demo driver: runs the scraper once and logs any failure instead of
 * letting it propagate.
 */
async function test() {
  try {
    // scraper.once() crawls each configured source sequentially.
    await scraper.once();
  } catch (failure) {
    console.log(failure);
  }
}

test();
Running the example will output:
/xvi-phantom-scraper$ node test.js
- Found Achene </w/index.php?title=Achene&action=edit&redlink=1>
- Found Berry </wiki/Berry>
- Found Capsule </w/index.php?title=Capsule_(fruit)&action=edit&redlink=1>
- Found Caryopsis </w/index.php?title=Caryopsis&action=edit&redlink=1>
- Found Drupe </wiki/Drupe>
...