0.2.0 • Published 6 years ago

crawlmatic v0.2.0

Weekly downloads
2
License
ASF-2.0
Repository
github
Last release
6 years ago

crawlmatic

npm package

Build Status Coverage Status

Single library for static or dynamic website crawling needs. A standard wrapper for HCCrawler & node-crawler, based on bluebird promises. Install using npm:

npm i crawlmatic --save

Dynamic crawling example:

const {
  DynamicCrawler
} = require('crawlmatic');

// Initialize with HCCrawler options
const crawler = new DynamicCrawler({
  //dynamically evaluate page title
  evaluatePage: (() => ({
    title: $('title').text(),
  }))
});
//Setup - resolved when Chromium instance is up
crawler.setup().then(() => {

  // make requests with HCCrawler queue options
  crawler.request({
    url: "http://example.com"
  }).then((resp) => {
    console.log(resp.result.title);

    // destroy the instance
    process.nextTick(() => crawler.destroy())
  })

});

Static crawling example:

const {
  StaticCrawler
} = require('crawlmatic');
//Initialize with node-crawler options
const staticCrawler = new StaticCrawler({
  maxConnections: 10,
  retries: 3
});

//setup internal node-crawler instance and resolves promise
staticCrawler.setup().then(() => {
  // makes request with node-crawler queue options
  staticCrawler.request({
    url: 'http://example.com'
  }).then((resp) => {
    //server side response parsing using cheerio
    let $ = res.$;
    console.log($("title").text());

    // destroy the instance
    process.nextTick(() => crawler.destroy())
  })
});