0.0.2 • Published 10 years ago

pauk v0.0.2

Weekly downloads
2
License
MIT
Repository
github
Last release
10 years ago

pauk

Basic Node.js web crawler.

Install

$ npm install pauk

Example:

var Pauk = require('pauk'),
    pauk = new Pauk({maxRequests: 10});

pauk.onFinish = function(cache) {
    cache.forEach(function(v, uri) {
        if (v.error) console.log("Error:\n" + v.error);
	    else console.log("Links:\n" + v.links.join("\n"), "\nCrawled Parents:\n" + v.parents.join("\n") + "\n");
    });
};

pauk.crawl('http://example.com');

Test

$ grunt

API

new Pauk(config)

Constructor. Configuration defaults:

www: true, // if true, www.example.com will resolve to example.com
maxRequests: 5, // maximum number of total requests
ignoreQuery: true // if true, query part of the URI will be ignored

The object passed to the constructor is merged into the default configuration object.

Example:

var pauk = new Pauk({maxRequests: 55});

crawl(uri, parent)

Main method; crawls the web page at the given URI.

  • uri - URI to be crawled
  • parent - (optional) URI of the page that contains a link to uri

Example:

var pauk = new Pauk();
pauk.crawl('http://example.com');

onFinish

Called when crawling is finished. The parameter passed to this function is an object containing the crawl data. The keys of this object are normalized URIs, and the values are objects containing:

{
    error: 'string', // (optional) in case of an error
    assets: {
        images: [], // values of src attributes of 'img' tags
        scripts: [], // values of src attributes of 'script' tags
        css: [], // values of href attributes of 'link' tags
        other: [] // values of href attributes of 'a' tags that are not URIs
    },
    links: [], // values of href attributes of 'a' tags (relative are converted to absolute)
    external: [] // values of href attributes of 'a' tags that are external links
}

Example:

var pauk = new Pauk();
pauk.onFinish = function(cache) {
    cache.forEach(function(v, uri) {
        if (v.error) console.log(uri + "\n" + v.error);
	    else console.log(uri + ": OK");
    });
}
pauk.crawl('http://example.com');