0.0.3 • Published 8 years ago
schwabbelini v0.0.3
[api] [github] [npm]
- Schwabbelini is a library for scraping relational data from websites.
Example
- iTunes podcasts
import Schwabbelini from '../app/Schwabbelini';
// Build the scraper instance: storage settings, scheduler settings, and the
// custom transformations that templates can refer to by name (`aria`, `page`).
const schwabbelini = new Schwabbelini({
  // Database settings. NOTE(review): the shape looks like a knex-style
  // `client` + `connection` config — confirm against the Schwabbelini API.
  // The credentials below are example values only.
  storeConfig: {
    client: 'pg',
    connection: {
      host: 'localhost',
      user: 'postgres',
      password: 'foobar',
      database: 'schwabbelini',
    },
  },
  // Scheduler settings. NOTE(review): units of `delay` are not visible here
  // (presumably milliseconds) — confirm.
  schedulerConfig: {
    delay: 100,
    concurrency: 10,
    retry: 3,
  },
  transformations: {
    // Returns the `aria-label` attribute of the document's model.
    aria: (document) => document.model.attr('aria-label'),
    // Collects the `href` of every non-selected link in the `ul.list.paginate`
    // match at index 1. Returns the array of hrefs, or the document itself
    // when no such links exist.
    page: (document) => {
      const pageLinks = document.model
        .find('ul.list.paginate')
        .eq(1)
        .find('li a:not(.selected)');
      // The original used `_.map`, but `_` is never imported in this example,
      // so the transformation threw a ReferenceError. `Array.from` with a map
      // callback performs the same element-wise mapping using only built-ins.
      // Assumes `pageLinks` is array-like or iterable — TODO confirm.
      const hrefs = Array.from(pageLinks, (elem) => document.model.find(elem).attr('href'));
      return hrefs.length ? hrefs : document;
    },
  },
});
// Source templates: `list` is the start URL; every other entry is an array of
// pipe-separated strings. NOTE(review): each string appears to read
// `<parent source> | <CSS selector> | <transformation>` — confirm against the
// Schwabbelini API. `page` uses the custom `page` transformation by name.
const sourceTemplates = {
  list: 'https://itunes.apple.com/us/genre/podcasts/id26?mt=2',
  subgenre: [
    'list | ul.top-level-subgenres > li a | href',
  ],
  letter: [
    'subgenre | ul.list.alpha > li a | href',
  ],
  page: [
    'letter | html | page',
  ],
  podcast: [
    'page | #selectedcontent li a | href',
  ],
};
// Table templates, keyed by name (here only `arts_podcast`).
// NOTE(review): presumably each inner key is a column and each value reads
// `<source> | <CSS selector> | <transformation>` — confirm against the
// Schwabbelini API. `rating` uses the custom `aria` transformation by name.
const tableTemplates = {
  arts_podcast: {
    // Fields taken from the listing pages.
    subgenre: 'list | ul.breadcrumb | text',
    id: 'podcast | * | url',
    letter: 'letter | ul.alpha a.selected | text',
    page: 'page | ul.paginate a.selected | text',
    // Fields taken from the individual podcast page.
    title: 'podcast | #title h1 | text',
    rating: 'podcast | #left-stack > div.extra-list.customer-ratings > div | aria',
    episodes: 'podcast | span.track-count | text',
  },
};
// Pass the source and table templates to the instance, then call `save()`.
// The first argument was `artsTemplates`, which is never declared and threw a
// ReferenceError; the source templates are declared as `sourceTemplates`.
schwabbelini
  .set(sourceTemplates, tableTemplates)
  .save();