1.2.8 • Published 9 years ago

site-parser v1.2.8

Weekly downloads
52
License
-
Repository
github
Last release
9 years ago

Parse websites using custom JSON templates.

TODO

  • Create proper documentation of the template options and how they work
  • Add test suite with Jest/Jasmine
  • Publish on npm :D

Introduction:

Parses a website according to a custom JSON template and returns the extracted entries as JSON data structures.

How to use it:

var debug = require('debug')('site-parser:testapp');
var util = require('util');
var site = require('./template');
var SiteParser = require('site-parser');

var siteParser = new SiteParser({
    timeOut: 5000
});

debug('running scraper');

siteParser.parse(site, function(err, entries) {
    if (err) return debug(err);
    debug(util.inspect(entries, { colors: true }));
});

Example Template:

{
    "name": "New Yorker",
    "url": "http://www.newyorker.com/",
    "format": "desktop",
    "template": {
        "containers": [{
            "elements": [{
                "items": [{
                    "attribute": "href",
                    "selector": "section h2 a"
                }],
                "name": "url",
                "occurence": "first",
                "required": true,
                "type": "url"
            }, {
                "items": [{
                    "selector": "section h2 a"
                }],
                "name": "title",
                "occurence": "first",
                "required": true
            }, {
                "items": [{
                    "selector": "section h3 span a"
                }],
                "name": "author",
                "occurence": "first"
            }, {
                "items": [{
                    "selector": "p.p-summary"
                }],
                "name": "description",
                "occurence": "first"
            }, {
                "fallback": null,
                "items": [{
                    "attribute": "src",
                    "selector": "figure a img"
                }, {
                    "attribute": "data-lazy-src",
                    "selector": "figure a img"
                }],
                "name": "image",
                "occurence": "first",
                "type": "url"
            }],
            "selector": "article"
        }]
    }
}

Example Output:

[{
    origin: 'http://www.newyorker.com/',
    url: 'http://www.newyorker.com/magazine/2014/10/27/alan-bean-plus-four',
    author: 'Tom Hanks',
    title: '“Alan Bean Plus Four”',
    description: '“When I tell people that I’ve seen the far side of the moon, they often say, ‘You mean the dark side,’ as though I’d fallen under the spell of Darth Vader or Pink Floyd.”',
    image: 'http://www.newyorker.com/wp-content/uploads/2014/10/141027_r25654-728-375-4.jpg',
    ranking: 1
}, {
    origin: 'http://www.newyorker.com/',
    url: 'http://www.newyorker.com/magazine/2014/10/27/ebola-wars',
    author: 'Richard Preston',
    title: 'The Ebola Wars',
    description: 'As the Ebola epidemic widens, the virus is mutating. Geneticists are racing to keep up.',
    image: 'http://www.newyorker.com/wp-content/uploads/2014/10/141027_r25672-320-240.jpg',
    ranking: 2
}, {
    origin: 'http://www.newyorker.com/',
    url: 'http://www.newyorker.com/magazine/2014/10/27/voting-numbers',
    author: 'Jelani Cobb',
    title: 'Who’s Missing from the Midterms?',
    description: 'Why our government doesn’t look like the people it’s supposed to represent.',
    image: 'http://www.newyorker.com/wp-content/uploads/2014/10/141027_r25680illurgb-320-240.jpg',
    ranking: 3
}
.....]
1.2.8

9 years ago

1.2.7

9 years ago

1.2.6

9 years ago

1.2.5

9 years ago

1.2.4

9 years ago

1.2.3

9 years ago

1.2.2

9 years ago

1.2.1

9 years ago

1.2.0

9 years ago

1.1.0

9 years ago

1.0.2

9 years ago

1.0.1

9 years ago