1.0.4 • Published 8 years ago

site-crawler v1.0.4

Weekly downloads
3
License
MIT
Repository
github
Last release
8 years ago

Crawler

Simple site crawler for node.js

Install

npm install site-crawler

Example Code

// Usage example: crawl a single site, logging every accepted URL and the
// <title> of every page that was fetched.
const Crawler = require('site-crawler');

const site = 'https://techcrunch.com';

// `concurrency` is optional; the default is 10.
const crawler = new Crawler({ concurrency: 10 });

// 'found' handler: called with a discovered URL and a `next` callback.
// Pass the URL (optionally modified) to `next` to crawl it, or pass null
// to reject it. Here only URLs that start with `site` are accepted.
function onFound(url, next) {
	const ok = url.startsWith(site);
	if (ok) {
		console.error('found:', url);
	}
	next(ok ? url : null);
}

// 'crawl' handler: `res` is the response object of the request module and
// `$` is a cheerio object for the fetched page. Call `next` when done.
function onCrawl(url, res, $, next) {
	console.error('\tcrawl:', $('title').text());
	next();
}

// 'error' handler: logs the URL together with the error's statusCode.
function onError(url, err) {
	console.error('\terror:', url, ':', err.statusCode);
}

// 'complete' handler: logs a final message.
function onComplete() {
	console.log('done.');
}

crawler
	.on('found', onFound)
	.on('crawl', onCrawl)
	.on('error', onError)
	.on('complete', onComplete);

crawler.start(site);

Tests

cd crawler
npm test

License

MIT

1.0.4

8 years ago

1.0.3

8 years ago

1.0.1

8 years ago

1.0.0

8 years ago