1.2.4 • Published 9 years ago

flowesh v1.2.4

Weekly downloads
3
License
ISC
Repository
-
Last release
9 years ago

Flowesh

Flowesh is the non-cluster version of floodesh. It is a lightweight, easy-to-maintain, middleware-based web spider.

Install

npm install flowesh

Usage

/**
 * Example spider. An instance of it is handed to `Flowesh#attach`.
 */
function Spider(){
    this.name = 'MySpider';
}

Spider.prototype = {
    // Initial tasks. Each task carries its request options under `opt` and
    // the name of a spider method under `next` (here: `parse`).
    // Note: `seed` lives on the prototype, so it is shared by every instance.
    seed:[{
        opt:{
            uri:'http://www.baidu.com'
        },
        next:'parse'
    }],
    /**
     * Init hook: do whatever you want on the init stage, then call `done`.
     * @param {Function} done - callback to invoke once initialisation is finished
     */
    onInit:function(done){
        // Extra seeds use the same { opt, next } shape as the initial seed above.
        this.seed.push({
            opt:{
                uri:'http://www.hao123.com',
                priority:3
            },
            next:'parse'
        });
        this.seed.push({
            opt:{
                uri:'http://www.qq.com',
                priority:1
            },
            next:'parse'
        });
        done();
    },
    // onData is optional, happens after the response goes through all the middlewares
    onData:function(dataSet){
        console.log(dataSet.get('data'));
    },
    // onComplete is optional, happens after onData
    onComplete:function(ctx){
        console.log('%s complete', ctx.request.url);
    },
    /**
     * Logs the page title (when one is present) and queues a follow-up task.
     * @param {Object} ctx - context; this method reads `ctx.content` and pushes to `ctx.tasks`
     * @param {Function} done - callback to invoke once parsing is finished
     */
    parse:function(ctx, done){
        // The closing tag must be written `<\/title>`: `\t` inside a regex is a TAB.
        const titleMatch = ctx.content.match(/<title>(.*?)<\/title>/);
        // match() returns null when the page has no <title>; skip logging instead of throwing.
        if (titleMatch) {
            console.log(titleMatch[1]);
        }
        // if you have new tasks generated
        ctx.tasks.push({
            opt:{
                uri:'http://www.163.com',
                priority:0
            },
            next:'parse'
        });
        done();
    }
};

const Flowesh = require('flowesh'),
    // response middleware that detects charset of response
    charsetparser = require('mof-charsetparser'),
    // response middleware that converts response to utf-8 encoding
    iconv = require('mof-iconv'),
    // response middleware that loads response into a jQuery object which has the same usage as jQuery
    cheerio = require('mof-cheerio'),
    // request middleware that corrects your queue options, e.g. attribute 'jquery' in your queue option will be changed into 'jQuery'
    normalizer = require('mof-normalizer'),
    // request middleware that adapts your queue options to meet request(https://github.com/request/request) requirements
    reqadapter = require('mof-reqadapter');

// Settings object passed to the Flowesh constructor:
// a `schedule` section and a `request` section.
const config = {
    schedule: {
        concurrent: 1,
        rate: 5000,
        priorityRange: 10, // default 10
    },
    request: {
        retry: 3,
    },
};

// Create the Flowesh instance from `config` and attach a Spider to it.
const flowesh = new Flowesh(config).attach(new Spider());

// middlewares will be executed in order

// Request-side middlewares, registered in this order.
for (const createRequestMiddleware of [normalizer, reqadapter]) {
    flowesh.requestmw.use(createRequestMiddleware());
}

// Response-side middlewares, registered in this order.
for (const createResponseMiddleware of [charsetparser, iconv, cheerio]) {
    flowesh.responsemw.use(createResponseMiddleware());
}

flowesh.start();
1.2.4

9 years ago

1.2.3

9 years ago

1.2.2

9 years ago

1.2.1

9 years ago

1.2.0

9 years ago

1.1.9

9 years ago

1.1.8

9 years ago

1.1.7

9 years ago

1.1.6

9 years ago

1.1.5

9 years ago

1.1.4

9 years ago

1.1.3

9 years ago

1.1.2

9 years ago

1.1.1

9 years ago

1.1.0

9 years ago

1.0.1

10 years ago

1.0.0

10 years ago