1.4.1 • Published 5 years ago

hcr v1.4.1

Weekly downloads
2
License
ISC
Repository
github
Last release
5 years ago

hcr

Hcr helps you grab data from web pages. It allows you to crawl an entire site recursively. It supports request rate limiting, adding custom headers, and converting HTML into objects shaped however you wish.

Dependencies

Getting Started

There is an example config that you can modify and use. The callback argument that you pass to the constructor is the default callback for all functions.

Installation

hcr is available on npm. To install it, type:

$ npm install hcr

Usage

var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);

crawler.getPage(['site1.com', 'site2.com']);
var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);

var object = {
	'Name': {
		selector: '#name',
		func: 'text'
	},
	'Image': {
		selector: '#image',
		func: 'attr',
		args: ['src']
	}
};

crawler.toObject(['site1.com', 'site2.com'], object);
var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);

var object = {
	'Name': {
		selector: '#name',
		func: 'text'
	},
	'Image': {
		selector: '#image',
		func: 'attr',
		args: ['src']
	}
};

crawler.recursiveToObject(['site1.com', 'site2.com'], object);
var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);

var object = {
	'Name': {
		selector: '#span',
		prop: 'textContent'
	}
};

crawler.recursiveToObject(['site1.com', 'site2.com'], object);
var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);

var regex = /[A-Z]/g;

var object = {
	'Name': {
		selector: '#span',
		prop: 'textContent'
	}
};

crawler.recursiveRegexToObject(['site1.com', 'site2.com'], regex, object);
var opts = {
	reservoir: 100,
  	reservoirRefreshInterval: 60 * 1000 
};

var crawler = new Crawler(opts, callback);
var doneCallback = function() {
	// crawling done
};

crawler.on('completed', doneCallback);