0.0.8 • Published 9 years ago
rakes v0.0.8
# rakes
rakes is a web scraper for Node.js.
Version
0.0.8
Installation
Install locally (for now):
$ npm install rakes
Setup
Sites.json
Configuration of sites to be crawled.
Required Values:
- url : url of site
- id : unique id within this json
- scraper : path to scraper relative to rakes.js
Optional Values:
- iterator : if iterating over sites
- limit : index where iterator stops
- startIndex : index where iterator starts
EX :
[
{
"url":"http://www.craigslist.org/about/sites",
"id":"cl",
"scraper":"../../scrapers/cl"
},
{
"url":"https://news.ycombinator.com/news?p=",
"iterator":"1",
"limit":"5",
"startIndex":"1",
"id":"yc",
"scraper":"../../scrapers/yc"
}
]
### Scrapers
Scrapers are modules that are used to parse the html from individual entries in your sites.json
Parameters:
- html : html from response
- app : reference to rakes
- cheerio : lets us use jQuery-like syntax
Public Properties:
- 'app.scrapeData' : The params property that holds the collective scraped data
Events:
- 'scraperComplete' : to be dispatched when the scraper has fully parsed the html.
EX:
/**
 * Example scraper: gathers the text of every `td.title` cell in the page.
 *
 * @param {string} html - html from the response
 * @param {object} app - reference to rakes
 * @param {object} cheerio - lets us use jQuery-like syntax
 */
module.exports.scraper = function (html, app, cheerio) {
  var $ = cheerio.load(html);

  // Append the text of each matched cell to the shared scrape results.
  $('td.title').each(function (index, cell) {
    app.scrapeData.push($(cell).text());
  });

  // Signal that this scraper has fully parsed the html.
  app.pubsub.emit('scraperComplete');
};
### Instantiation
Rake(sites.json, callback)
// Example entry point: start an express app, then run rakes over the sites config.
var express = require('express');
var sites = require('./sites');
var rakes = require('./rakes');

var app = express();
var PORT = process.env.PORT || '8081';

app.listen(PORT);
exports = module.exports = app;

// Rake(sites, callback): the callback is handed the result as `data`
// (presumably the collected scrape data; confirm against rakes.js).
rakes.Rake(sites, function (data) {
  console.log(data);
  // process.kill() requires a pid argument and throws a TypeError without one;
  // process.exit() ends this process cleanly once the data has been printed.
  process.exit();
});
Dependencies
- Cheerio
- Request
Todos
- fix global install
- get a haircut
License
MIT