0.0.9 • Published 2 years ago

@satankebab/scraping-utils v0.0.9

Weekly downloads
-
License
MIT
Repository
github
Last release
2 years ago

Scraping Utils

Set of utils and queues to make web scraping easy.

Features:

  • automatic retrying on errors
  • definining minimum and maximum time of a message to be processed
  • parallelism can be configured
  • automatic initializatio & clean up of puppeteer browser (and pages)

Installation

npm install @satankebab/scraping-utils;

Examples

With Puppeteer

import { Page, createQueue, PartialSubscriber, runPuppeteerQueue } from "@satankebab/scraping-utils"

const crawlSomething = async () => {
  // Define a shape of the data we want to store in our queue
  type Payload = {
    url: string
    attempt: number
  }
  
  // Create a queue
  const queue = createQueue<Payload>({
    parallelLimit: 2
  })
  
  // Add some initial items to the queue
  queue.enqueue({
    url: 'https://fsharpforfunandprofit.com/',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://github.com/gcanti/fp-ts',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://elm-lang.org/',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://github.com/Effect-TS/core',
    attempt: 0,
  })
  
  // Define our 'crawler' and the type of the payload with puppeteer page
  type PayloadWithPage = Payload & {
    page: Page,
  }
  const crawler: PartialSubscriber<PayloadWithPage, Payload> = {
    // For each payload, call this function
    next: async ({ page, url }) => {
      await page.goto(url);
      console.log('The first 100 chars from response:', (await page.content()).slice(0, 100))
      // You can call queue.enqueue here to add more items to crawl here, for example:
      if (url === 'https://github.com/Effect-TS/core') {
        queue.enqueue({
          url: 'https://github.com/Effect-TS/monocle',
          attempt: 0,
        })
      }
    },
    // When there is an error, we want to be notified about it along with the original payload that caused the error.
    error: (error, payload) => {
      console.error(`Upps, could not process ${payload.url}, error: `, error, `, attempt: ${payload.attempt}`)
    },
  }
  
  // Consume payloads from the queue and resolve when the queue is empty
  await runPuppeteerQueue({
    queue,
    crawler,
    // Max time in ms crawler's next function can be processing (timeouting crawling that takes too long)
    maxProcessingTime: 2 * 60 * 1000,
    // Min time in ms crawler's next function can be processing (slowing down crawling that is too fast)
    minProcessingTime: 5 * 1000,
    // How many numbers do we want to try the same payload until we call crawler's .error method
    retryAttempts: 2,
    // Options that are directly passed to puppeteer's .launch method
    puppeteerLaunchOptions: { headless: false } 
  })

  console.log('We are done!')
}


crawlSomething()

Without Puppeteer

import { createQueue, PartialSubscriber, runBasicQueue } from "@satankebab/scraping-utils"
import fetch from "node-fetch";

const crawlSomething = async () => {
  // Define a shape of the data we want to store in our queue
  type Payload = {
    url: string
    attempt: number
  }
  
  // Create a queue
  const queue = createQueue<Payload>({
    parallelLimit: 2
  })
  
  // Add some initial items to the queue
  queue.enqueue({
    url: 'https://fsharpforfunandprofit.com/',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://github.com/gcanti/fp-ts',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://elm-lang.org/',
    attempt: 0,
  })
  queue.enqueue({
    url: 'https://github.com/Effect-TS/core',
    attempt: 0,
  })
  
  // Define our 'crawler'
  const crawler: PartialSubscriber<PayloadWithPage, Payload> = {
    // For each payload, call this function
    next: async ({ url }) => {
      const response = await fetch(url);
      console.log('The first 100 chars from response:', (await response.text()).slice(0, 100))
      // You can call queue.enqueue here to add more items to crawl here, for example:
      if (url === 'https://github.com/Effect-TS/core') {
        queue.enqueue({
          url: 'https://github.com/Effect-TS/monocle',
          attempt: 0,
        })
      }
    },
    // When there is an error, we want to be notified about it along with the original payload that caused the error.
    error: (error, payload) => {
      console.error(`Upps, could not process ${payload.url}, error: `, error, `, attempt: ${payload.attempt}`)
    },
  }
  
  // Consume payloads from the queue and resolve when the queue is empty
  await runBasicQueue({
    queue,
    crawler,
    // Max time in ms crawler's next function can be processing (timeouting crawling that takes too long)
    maxProcessingTime: 2 * 60 * 1000,
    // Min time in ms crawler's next function can be processing (slowing down crawling that is too fast)
    minProcessingTime: 5 * 1000,
    // How many numbers do we want to try the same payload until we call crawler's .error method
    retryAttempts: 2,
  })

  console.log('We are done!')
}


crawlSomething()
0.0.9

2 years ago

0.0.8

2 years ago

0.0.3

3 years ago

0.0.5

3 years ago

0.0.4

3 years ago

0.0.7

2 years ago

0.0.6

2 years ago

0.0.2

3 years ago

0.0.1

3 years ago