scrapyard v0.2.2
Scrapyard
Scrapyard makes scraping websites easy. It's a wrapper for most of the things you need, comes with optional caching and retries, and opens as many connections as you like.
Installation
npm install scrapyard
Usage
var scrapyard = require("scrapyard");
var scraper = new scrapyard({
debug: true,
retries: 5,
connections: 10,
cache: './storage',
bestbefore: "5min"
});
- `retries`: number of times the scraper attempts to fetch the url before giving up. Default: 5.
- `connections`: number of concurrent connections a scraper will make. Setting this too high could be considered a DDoS, so be polite and keep this reasonable.
- `cache`: a folder where scraped contents are cached. By default caching is off.
- `bestbefore`: time your cache is valid, either an int of milliseconds or a string; valid forever when 0.
Call
scraper(options, callback);
or simply
scraper(url, callback);
The first argument can be either a url string or an options object. url is the only option required.
- `url` is a string containing the HTTP URL
- `type` is either `'html'`, `'xml'`, `'json'` or `'raw'` (default: `'html'`)
- `method` is the HTTP method (default: `'GET'`)
- `form` is an object containing your form data
- `encoding` is passed to `http.setEncoding()` (default: `'binary'`)
- `callback(err, data)` is the callback method
Although scrapyard has only been tested with these 6 options, you can try to set any option available for request.
Examples
var scrapyard = require("scrapyard");
var scraper = new scrapyard({
cache: './storage',
debug: true,
timeout: 300000,
retries: 5,
connections: 10
});
// html, passes you a jquery-like `cheerio` object
scraper('http://example.org/test.html', function(err, $) {
if (err) return console.error(err);
console.log($('h1').text());
});
// post something
scraper({
url: 'http://example.org/test.html',
type: 'html',
encoding: 'binary',
method: 'POST',
form: {key1: 'value1', key2: 'value2'}
}, function(err, $) {
if (err) return console.error(err);
console.log($('h1').text());
});
// xml, converts xml to a javascript object with `xml2js`
scraper({
url: 'http://example.org/test.xml',
type: 'xml',
encoding: 'utf8'
}, function(err, xml) {
if (err) return console.error(err);
console.log(xml);
});
// json, as delivered by `json.stringify`
scraper({
url: 'http://example.org/test.json',
type: 'json',
}, function(err, json){
if (err) return console.error(err);
console.log(json);
});
// raw, just pass on whatever the webserver spits out
scraper({
url: 'http://example.org/test.bin',
type: 'raw',
}, function(err, data){
if (err) return console.error(err);
console.log(data);
});
Tor
It's possible to use scrapyard with tor using the socks5-http-client module:
var scrapyard = require("scrapyard");
var scraper = scrapyard();
var Agent = require('socks5-http-client/lib/Agent');
scraper({
url: "http://freepress3xxs3hk.onion/about",
headers: {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
},
agentClass: Agent,
agentOptions: {
socksHost: 'localhost',
socksPort: 9050
},
method: "GET",
type: "html",
encoding: "utf-8"
}, function(err, $){
if (err) return console.log(err);
$(".content p").each(function(){
console.log($(this).text());
});
});
2 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
7 years ago
7 years ago
9 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
11 years ago
11 years ago
11 years ago
11 years ago
12 years ago
12 years ago
12 years ago
13 years ago