Scrapyard
Scrapyard makes scraping websites easy. It's a wrapper around most of the things you need, comes with optional caching and retries, and opens as many connections as you like.
Installation
npm install scrapyard
Usage
var scrapyard = require("scrapyard");
var scraper = new scrapyard({
debug: true,
retries: 5,
connections: 10,
cache: './storage',
bestbefore: "5min"
});
retries: the number of times the scraper attempts to fetch the URL before giving up (default: 5)
connections: the number of concurrent connections the scraper will make; setting this too high could be considered a DDoS, so be polite and keep it reasonable
cache: a folder where scraped contents are cached; caching is off by default
bestbefore: how long your cache stays valid, either an integer of milliseconds or a string like "5min"; 0 means valid forever (see the sketch below)
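For example, a cache-enabled scraper whose entries expire after ten minutes could be set up like this. This is a minimal sketch: the './storage' folder and the ten-minute window are just example values, and the "10min" string assumes the same duration format as the "5min" example above.

var scrapyard = require("scrapyard");

// cache responses in ./storage and treat them as fresh for ten minutes;
// bestbefore also accepts plain milliseconds, e.g. 600000
var scraper = new scrapyard({
    retries: 3,
    connections: 5,
    cache: './storage',
    bestbefore: "10min"
});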
Call
scraper(options, callback);
or simply
scraper(url, callback);
The first argument can be either a url string or an options object. url is the only option required.
url: a string containing the HTTP URL
type: either 'html', 'xml', 'json' or 'raw' (default: 'html')
method: the HTTP method (default: 'GET')
form: an object containing your form data
encoding: passed to http.setEncoding() (default: 'binary')
callback(err, data): the callback method
Although scrapyard has only been tested with these six options, you can try to set any other option supported by request.
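For instance, a custom User-Agent header is a plain request option that scrapyard is assumed to pass through unchanged (the Tor example below relies on the same pass-through for headers and agentClass); treat this as a sketch rather than a documented feature:

scraper({
    url: 'http://example.org/test.html',
    type: 'html',
    // `headers` is a standard request option, not one of the six documented scrapyard options
    headers: { 'User-Agent': 'my-scraper/1.0' }
}, function(err, $) {
    if (err) return console.error(err);
    console.log($('title').text());
});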
Examples
var scrapyard = require("scrapyard");
var scraper = new scrapyard({
cache: './storage',
debug: true,
timeout: 300000,
retries: 5,
connections: 10
});
// html, passes you a jquery-like `cheerio` object
scraper('http://example.org/test.html', function(err, $) {
if (err) return console.error(err);
console.log($('h1').text());
});
// post something
scraper({
url: 'http://example.org/test.html',
type: 'html',
encoding: 'binary',
method: 'POST',
form: {key1: 'value1', key2: 'value2'}
}, function(err, $) {
if (err) return console.error(err);
console.log($('h1').text());
});
// xml, converts xml to a javascript object with `xml2js`
scraper({
url: 'http://example.org/test.xml',
type: 'xml',
encoding: 'utf8'
}, function(err, xml) {
if (err) return console.error(err);
console.log(xml);
});
// json, parsed into a javascript object with `JSON.parse`
scraper({
url: 'http://example.org/test.json',
type: 'json',
}, function(err, json){
if (err) return console.error(err);
console.log(json);
});
// raw, just passes on whatever the webserver spits out
scraper({
url: 'http://example.org/test.bin',
type: 'raw',
}, function(err, data){
if (err) return console.error(err);
console.log(data);
});
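When the cache and bestbefore options are set as in the constructor above, repeating a request for the same URL within the bestbefore window should be served from the './storage' folder instead of the network. A sketch, assuming the cache is keyed by the URL:

// the first call hits the network, the repeat call should come from the cache
scraper('http://example.org/test.html', function(err, $) {
    if (err) return console.error(err);
    console.log('first fetch:', $('h1').text());
    scraper('http://example.org/test.html', function(err2, $cached) {
        if (err2) return console.error(err2);
        console.log('repeat fetch (cached):', $cached('h1').text());
    });
});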
Tor
It's possible to use scrapyard with Tor using the socks5-http-client module:
var scrapyard = require("scrapyard");
var scraper = new scrapyard({});
var Agent = require('socks5-http-client/lib/Agent');
scraper({
url: "http://freepress3xxs3hk.onion/about",
headers: {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
},
agentClass: Agent,
agentOptions: {
socksHost: 'localhost',
socksPort: 9050
},
method: "GET",
type: "html",
encoding: "utf-8"
}, function(err, $){
if (err) return console.log(err);
$(".content p").each(function(){
console.log($(this).text());
});
});