
@hbauer/scrape

Installation

$ yarn add @hbauer/scrape
$ npm install @hbauer/scrape

Usage

Basic

import { Scrape } from '@hbauer/scrape'

const baseURL = 'https://httpbin.org' // trailing slash is OK

const httpbin = await Scrape.init(baseURL, {
  contentType, // 'json' | 'html' (defaults to 'json')
  returnRawFetchResponse, // boolean (defaults to false)
  cache: {
    enabled, // boolean (defaults to true)
    rootDirectory, // string (defaults to '__cache')
    name, // can be anything, e.g. 'scraper-name' (defaults to the hostname of the baseURL)
    fileExtension, // can be anything, e.g. 'json' | 'html' (defaults to undefined)
  },
  retry: {
    attempts, // number (defaults to 0)
  },
  throttle: {
    interval, // number (in milliseconds; defaults to 1000)
    limit, // number (defaults to 1)
  }
})

// Scraping a resource is now as simple as:
const file = await httpbin.scrape('uuid')

// Using an absolute URL instead (this results in the exact same behaviour):
const absolute = await httpbin.scrape('https://httpbin.org/uuid')

// Future calls to the same URL will return from the cache:
const cachedFile = await httpbin.scrape('uuid')

assert.equal(cachedFile.attributes.fromCache, true)

// You can force-invalidate a file:
const forceInvalidated = await httpbin.scrape('uuid', { invalidate: { force: true } }) // always re-fetches

assert.equal(forceInvalidated.attributes.fromCache, false)

// Or invalidate a file based on how old it is:
const expired = await httpbin.scrape('uuid', { invalidate: { expiredAfter: [1, 'week'] } })

assert.equal(expired.attributes.expired, true)
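
The retry and throttle options apply to every request the instance makes, which matters when scraping many pages at once. As a minimal sketch (assuming the defaults configured above, i.e. 1 request per 1000 ms, and a few illustrative httpbin endpoints):

// These httpbin paths are just illustrative endpoints
const paths = ['uuid', 'ip', 'user-agent']

// Requests queue through the instance's throttle, so on a cold
// cache this should take roughly paths.length * interval ms;
// cached responses return immediately on subsequent runs
const files = await Promise.all(paths.map(path => httpbin.scrape(path)))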

Cache

import { Scrape } from '@hbauer/scrape'

const baseURL = 'https://httpbin.org'

/**
 * Cache Options
 *
 * enabled - the cache can be enabled or disabled (enabled by default)
 * rootDirectory - PROJECT_ROOT/${rootDirectory}
 * name - alternatively, PROJECT_ROOT/${rootDirectory}/${name}
 * fileExtension - PROJECT_ROOT/${rootDirectory}/${name}/path/file.${fileExtension}
 */
const httpbin = await Scrape.init(baseURL, { contentType: 'html' })

const file = await httpbin.scrape('html') // alias for https://httpbin.org/html

// Get the local path to a cached file:
const { fullPath } = await httpbin.cache.getPaths('html') // = '__cache/httpbin.org/html'

// Get the cached file
const cachedFile = await httpbin.cache.get('html')

assert.equal(file.data, cachedFile.data)
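
Putting the cache options above together, here is a minimal sketch of a customised cache (the option values are illustrative examples, not defaults):

// With these options, a scrape of 'html' should be cached at
// PROJECT_ROOT/data/httpbin-scraper/html.html
const custom = await Scrape.init(baseURL, {
  contentType: 'html',
  cache: {
    rootDirectory: 'data', // instead of the default '__cache'
    name: 'httpbin-scraper', // instead of the default hostname
    fileExtension: 'html', // appended to cached file paths
  },
})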