light-crawler v1.7.3
Light Crawler - Directed Crawler
A simplified directed web crawler, easy to use for scraping pages and downloading resources.
English Doc (here) or Chinese Doc (中文文档).
Install
npm install light-crawler
Example
const Crawler = require('light-crawler');
// create an instance of Crawler
let c = new Crawler();
// add a url to request (or an array of urls via addTasks)
c.addTask('http://www.xxx.com');
// define a scraping rule
c.addRule(function (result) {
// result has 2 props: task and body
// result.task: id, url, and anything else you added
// result.body is the HTML of the page
// scrape result.body; you can use cheerio
})
// start your crawler
c.start().then(() => {
console.log('Finished!');
});
Crawler Properties
In light-crawler, requesting a page is called a task. Tasks are put into the task-pool and executed in order.
settings: basic settings of the crawler
- id: id of the crawler, integer or string, default: null
- interval: crawling interval, default: 0 (ms), or a random value in a range, e.g. [200,500]
- retry: retry times, default: 3
- concurrency: an integer determining how many tasks should be run in parallel, default: 1
- skipDuplicates: whether to skip duplicate tasks (same url), default: false
requestOpts: request options for tasks; these are the global request options
- timeout: default: 10000
- proxy: proxy address
- headers: request headers, default: {}
- or other settings in request opts
taskCounter: counts all finished tasks, whether they failed or not
failCounter: counts all failed tasks
doneCounter: counts tasks which have been done
started: boolean
finished: boolean
errLog: records all error info during crawling
downloadDir: downloaded files are saved here, default: ../__dirname
drainAwait: the crawler finishes when the task-pool is drained; this prop makes the crawler wait for new tasks when the task-pool is drained, default: 0 (ms). See the sketch after this list.
tasksSize: size of the task-pool; exceeding tasks go into the task-pool's buffer, default: 50
logger: show the console log, default: false
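A minimal sketch of drainAwait (the url and the 3000 ms value are just placeholders); it keeps the crawler alive while rules are still discovering new tasks:
// e.g.:
let c = new Crawler({ drainAwait: 3000 }); // wait up to 3s for new tasks once the pool is drained
c.addTask('http://www.xxx.com');
c.addRule(function (result) {
  // links found while scraping are fed back into the task-pool
  c.addTasks(Crawler.getLinks(result.body, result.task.url));
});
c.start();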
Crawler API
Crawler(opts: object)
constructor of Crawler
// e.g.:
let c = new Crawler({
interval: 1000,
retry: 5,
.... // other props of `crawler.settings`
requestOpts: {
timeout: 5000,
proxy: 'http://xxx'
.... // other props of `crawler.requestOpts`
}
});
tweak(opts: object)
tweak the settings of the crawler
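For example (the values here are arbitrary):
// e.g.:
c.tweak({
  interval: [200, 500], // random interval between 200 and 500 ms
  concurrency: 5,
  logger: true
});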
addTasks(urls: string or array[, props: object])
add tasks into the task-pool
// e.g.
// add single task
// input: url
c.addTask('http://www.google.com');
// input: url, prop
// set request options for the task (overrides the global ones)
c.addTask('http://www.google.com', {
name: 'google',
requestOpts: { timeout: 1 }
});
// input: url, next(processor of the task)
// crawler rules will not process this task again
c.addTask('http://www.google.com', function (result) {
console.log('the task has done');
});
// input: url, prop, next
c.addTask('http://www.google.com', { name: 'google' }, function (result) {
console.log('the task has done');
});
// or input an object
c.addTask({
url: 'http://www.google.com',
type: 'SE',
next: function (result) {
console.log('the task has done');
}
});
// add multiple tasks
// input: an array of strings
c.addTasks(['http://www.google.com','http://www.yahoo.com']);
// add prop for tasks
c.addTasks(['http://www.google.com','http://www.yahoo.com'], { type: 'SE' });
// get these props in processing function
c.addRule(function (result) {
if (result.task.type == 'SE') {
console.log('Searching Engine');
}
});
// input: an array of object
c.addTasks([
{
url: 'http://www.google.com',
name: 'google'
},
{
url: 'http://www.sohu.com',
name: 'sohu'
}
]);
addRule(reg: string|object, func: function)
define a rule for scraping
// e.g.:
let tasks = [
'http://www.google.com/123',
'http://www.google.com/2546',
'http://www.google.com/info/foo',
'http://www.google.com/info/123abc'
];
c.addTasks(tasks);
c.addRule('http://www.google.com/[0-9]*', function (result) {
// matches tasks[0] and tasks[1]
});
c.addRule('http://www.google.com/info/**', function (result) {
// matches tasks[2] and tasks[3]
});
// or you can omit the reg string
c.addRule(function (result) {
// matches every url in tasks
});
// $ (i.e. cheerio.load(result.body)) is an optional arg
c.addRule(function (result, $){
console.log($('title').text());
});
Tip: light-crawler escapes every . in a rule string, so you can write www.a.com directly instead of www\\.a\\.com. If you need .*, use ** instead, just like the example above. If you really need a regex ., write <.>.
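A small illustration of these patterns (the urls are made up):
// e.g.:
c.addTask('http://www.a.com/news/2024-01-01.html');
// '.' in 'www.a.com' is escaped automatically; '**' stands for '.*'
c.addRule('http://www.a.com/news/**', function (result) {
  // matches the task above
});
// use '<.>' when you need a real regex dot
c.addRule('http://www.a.com/news/**<.>html', function (result) {
  // also matches the task above
});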
start()
start the crawler
// e.g.:
c.start().then(function () {
// on finished
console.log('done!');
});
pause()
pause the crawler
resume()
resume the crawler
isPaused()
whether the crawler is paused or not
stop()
stop the crawler
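A minimal sketch combining the control methods above (the 5-second delay is arbitrary):
// e.g.:
c.start().then(function () {
  console.log('done!');
});
c.pause();
console.log(c.isPaused()); // true
setTimeout(function () {
  c.resume(); // continue crawling after 5 seconds
}, 5000);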
uniqTasks()
remove duplicate tasks (deep comparison)
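For example (a tiny sketch):
// e.g.:
c.addTasks(['http://www.a.com', 'http://www.a.com']);
c.uniqTasks(); // only one task for 'http://www.a.com' remains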
log(info: string, isErr: boolean, type: int)
crawler's logger
// e.g.:
// if it's an error, it will be appended to c.errLog
c.log('some problems', true);
// console print:
// [c.settings.id if it is set]some problems
// type is the color code of the leading '[...]', e.g. '[Crawler is Finished]'
// 1 red, 2 green, 3 yellow, 4 blue, 5 magenta, 6 cyan... and so on
c.log('[Parsed]blahblah~', false, 4);
// console print:
// [c.settings.id if it is set][Parsed]blahblah~ ('[Parsed]' will be printed in blue)
// you can do something after every log() call
c.on('afterLog', function (info, isErr, type) {
fs.appendFileSync('c.log', info); // append info to c.log
....
});
// you can even replace log()
c.log = function (info, isErr, type) {
// log something....
};
Download Files
just add downloadTask: true to any task you need to download
// e.g.:
// specify download directory
c.tweak({ downloadDir: 'D:\\yyy' });
let file = 'http://xxx/abc.jpg';
// 'abc.jpg' will be downloaded into 'D:\\yyy'
c.addTask(file, {downloadTask: true});
// or you can specify its name
c.addTask(file, {downloadTask: true, downloadFile: 'mine.jpg'});
// or specify relative dir(to 'D:\\yyy')
// if this directory ('jpg') doesn't exist, the crawler will create it
c.addTask(file, {downloadTask: true, downloadFile: 'jpg/mine.jpg'});
// or specify absolute dir
c.addTask(file, {downloadTask: true, downloadFile: 'C:\\pics\\mine.jpg'});
Events
start
after the crawler is started
// e.g.
c.on('start', function () {
console.log('started!');
});
beforeCrawl
task's props: id, url, retry, working, requestOpts, downloadTask, downloadFile, and so on
// e.g.
c.on('beforeCrawl', function (task) {
console.log(task);
});
drain
when the task-pool and its buffer are drained
// e.g.
c.on('drain', function () {
// do something
});
error
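emitted when an error occurs during crawling; a minimal sketch (the listener argument is an assumption, not taken from this doc):
// e.g.
c.on('error', function (err) {
  // 'err' is assumed to be the error object
  console.error(err);
});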
Utils API
getLinks(html: string, baseUrl: string)
get all links in the given html (or cheerio element)
// e.g.:
let html = `
<div>
<ul>
<li>
<a href="http://link.com/a/1">1</a>
<a href="a/2">2</a>
<a href="b/3">3</a>
</li>
<li><a href="4">4</a></li>
<li>foo</li>
</ul>
</div>
`;
let links = Crawler.getLinks(html, 'http://link.com/index.html');
console.log(links);
// ['http://link.com/a/1','http://link.com/a/2','http://link.com/b/3','http://link.com/4']
// you can also pass a cheerio element (assumes: const cheerio = require('cheerio'))
let $ = cheerio.load(html);
let ulLinks = Crawler.getLinks($('ul'));
getImages(html: string, baseUrl: string)
like getLinks, but gets the src from <img> tags.
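For example (assumed to mirror getLinks):
// e.g.:
let imgs = Crawler.getImages(html, 'http://link.com/index.html');
// imgs: absolute urls taken from the src of each <img>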
loadHeaders(file: string)
load request headers from a file
example.headers
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8,en;q=0.6
Cache-Control:max-age=0
Connection:keep-alive
Cookie:csrftoken=Wwb44iw
Host:abc
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64)
...
load this file and set headers for requesting
let headers = Crawler.loadHeaders('example.headers');
c.tweak({
requestOpts: {
headers: headers
}
});
getRegWithPath(fromUrl: string)
get a reg string based on the path of fromUrl
let reg = Crawler.getRegWithPath('http://www.google.com/test/something.html');
// reg: http://www.google.com/test/**
Advanced Usage
addRule
// since 1.5.10, the rule for scraping can be an object
c.addTask('http://www.baidu.com', { name: 'baidu', type: 'S.E.' });
c.addTask('http://www.google.com', { name: 'google', type: 'S.E.' });
// the following rules have the same reg string, but different names
c.addRule({ reg: 'www.**.com', name: 'baidu' }, function (r) {
// scraping r.body
});
c.addRule({ reg: 'www.**.com', name: 'google' }, function (r) {
// scraping r.body
});
// using a match function can make rules more complex
// boolean match(task)
c.addTask('http://www.baidu.com', { tag: 3 });
c.addTask('http://www.google.com', { tag: 50 });
c.addRule({ reg: 'www.**.com', match: function (task) {
return task.tag > 10;
}}, function (r) {
// scrape google
});
loadRule
reuse rules
// lc-rules.js
exports.crawlingGoogle = {
reg: 'www.**.com',
name: 'google',
scrape: function (r, $) {
// ...
}
};
// crawler.js
const { crawlingGoogle } = require('./lc-rules');
let c = new Crawler();
c.addTask('http://www.google.com', { name: 'google' });
c.loadRule(crawlingGoogle);
// or extend the 'scrape' function with an extra 'expand' callback
// the 'expand' callback is implemented when calling 'loadRule'
// also, you can use 'this' (the Crawler) inside 'addRule' or 'loadRule'
crawlingGoogle = {
// ...
scrape: function (r, $, expand) {
expand($('title').text());
}
};
crawlerAAA.loadRule(crawlingGoogle, function (text) {
console.log(text);
this.addTask('www.abc.com');
});
crawlerBBB.loadRule(crawlingGoogle, function (text) {
console.log(text.toLowerCase());
});
removeRule
remove some rules
// by its 'ruleName'
let rule = {
// ...
ruleName: 'someone'
// ...
}
c.loadRule(rule);
c.removeRule('someone');