@hbauer/scrape
Installation
$ yarn add @hbauer/scrape
$ npm install @hbauer/scrape
Usage
Basic
import { Scrape } from '@hbauer/scrape'
const baseURL = 'https://httpbin.org' // trailing slash is OK
const httpbin = await Scrape.init(baseURL, {
contentType, // 'json' | 'html' (defaults to 'json')
returnRawFetchResponse, // boolean (defaults to false)
cache: {
enabled, // boolean (defaults to true)
rootDirectory, // string (defaults to '__cache')
name, // can be anything, e.g. 'scraper-name' (defaults to the hostname of the baseURL)
fileExtension, // can be anything, e.g. 'json' | 'html' (defaults to undefined)
},
retry: {
attempts, // number (defaults to 0)
},
throttle: {
interval, // number (in milliseconds; defaults to 1000)
limit, // number (defaults to 1)
}
})
// Scraping a resource is now as simple as:
const file = await httpbin.scrape('uuid')
// Using an absolute URL instead (this results in the exact same behaviour):
const absolute = await httpbin.scrape('https://httpbin.org/uuid')
// Future calls to the same URL will return from the cache:
const cachedFile = await httpbin.scrape('uuid')
assert.equal(cachedFile.attributes.fromCache, true)
// You can force invalidate a file:
const forceInvalidated = await httpbin.scrape('uuid', { invalidate: { force: true } }) // always re-fetches
assert.equal(forceInvalidated.attributes.fromCache, false)
// Or invalidate based on how old the file is
const expired = await httpbin.scrape('uuid', { invalidate: { expiredAfter: [1, 'week'] } })
assert.equal(expired.attributes.expired, true)
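The retry and throttle options shape how requests are issued. Below is a minimal sketch of a scraper that retries failed requests and limits itself to two requests per second; the exact retry and throttle semantics are assumed from the option names and defaults listed above.
import { Scrape } from '@hbauer/scrape'
// Assumption: `retry.attempts` re-attempts failed requests, and `throttle`
// allows at most `limit` requests per `interval` milliseconds
const throttled = await Scrape.init('https://httpbin.org', {
  retry: { attempts: 3 },
  throttle: { interval: 1000, limit: 2 },
})
// Requests beyond the throttle limit are queued rather than fired immediately
const [uuid, ip] = await Promise.all([
  throttled.scrape('uuid'),
  throttled.scrape('ip'),
])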
Cache
import { Scrape } from '@hbauer/scrape'
const baseURL = 'https://httpbin.org'
/**
* Cache Options
*
* enabled - the cache can be enabled or disabled (enabled by default)
* rootDirectory - PROJECT_ROOT/${rootDirectory}
* name - alternatively, PROJECT_ROOT/${rootDirectory}/${name}
* fileExtension - PROJECT_ROOT/${rootDirectory}/${name}/path/file.${fileExtension}
*/
const httpbin = await Scrape.init(baseURL, { contentType: 'html' })
const file = await httpbin.scrape('html') // alias for https://httpbin.org/html
// Get the local path to a cached file:
const { fullPath } = await httpbin.cache.getPaths('html') // = '__cache/httpbin.org/html'
// Get the cached file
const cachedFile = await httpbin.cache.get('html')
assert.equal(file.data, cachedFile.data)
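Putting the cache options together, here is a sketch of how a customised cache location maps to paths on disk. The resulting path in the final comment is an assumption derived from the rootDirectory/name/fileExtension pattern documented above, not output from the library.
import { Scrape } from '@hbauer/scrape'
const custom = await Scrape.init('https://httpbin.org', {
  cache: {
    rootDirectory: 'data', // PROJECT_ROOT/data
    name: 'httpbin', // PROJECT_ROOT/data/httpbin
    fileExtension: 'json', // PROJECT_ROOT/data/httpbin/<path>.json
  },
})
await custom.scrape('uuid')
const { fullPath } = await custom.cache.getPaths('uuid')
// Assumption: fullPath resolves to something like 'data/httpbin/uuid.json'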