0.0.54 • Published 9 months ago

@letsscrapedata/controller v0.0.54

Weekly downloads
-
License
Apache-2.0
Repository
github
Last release
9 months ago

For quick help and discussion about how to scrape a website, please use the Discord server. For anything that needs to be tracked, it is better to submit an issue on GitHub.

Features

This package is used by @letsscrapedata/scraper to facilitate switching between different types of browser controllers and to facilitate support for the new anti-bot browser controller without modifying existing programs.

  • Same interface for playwright, puppeteer and cheerio (more to come): easy to switch between them
  • Web browsing automation: goto(open) / click / input / hover / select / scroll
  • State data management: cookies, localStorage, HTTP Headers, custom session data
  • Request and response interception management: data and HTTP headers
  • Elements selection by CSS selectors or XPath: whether in frames or not
  • Element's attributes: innerHtml, innerText, outerHtml, textContent, etc
  • Automatic file saving: such as screenshot, pdf
  • CDP message
  • Page evaluate
  • Implements functions that individual browser controllers do not support, and provides workarounds for known issues

Install

npm install @letsscrapedata/controller

Examples

import { controller } from "@letsscrapedata/controller";

// Launch a visible (non-headless) Chromium browser through the playwright controller.
const browser = await controller.launch("playwright", "chromium", { headless: false });
try {
  // newBrowserContext() and getPage() are both typed as possibly returning null,
  // so guard before dereferencing them (required under strictNullChecks).
  const browserContext = await browser.newBrowserContext();
  if (!browserContext) {
    throw new Error("failed to create a browser context");
  }
  const page = await browserContext.getPage();
  if (!page) {
    throw new Error("failed to get a page from the browser context");
  }

  await page.goto("https://www.letsscrapedata.com/pages/listexample1.html");
  await page.screenshot({ path: "screenshot.png" });
} finally {
  // Always close the browser, even if navigation or the screenshot fails.
  await browser.close();
}

Same interfaces

  • LsdElement
  • LsdPage
  • LsdBrowserContext
  • LsdBrowser
  • LsdBrowserController

LsdPage

/**
 * Controller-independent page interface: the same set of methods is exposed
 * whichever underlying controller (playwright, puppeteer, cheerio) provides the page.
 * Some members are only valid for a specific controller; this is noted per member.
 */
export interface LsdPage extends EventEmitter {
  /**
   * Get the LsdApiContext associated with this page's LsdBrowserContext
   * * only valid in playwright
   */
  apiContext(): LsdApiContext;

  bringToFront(): Promise<boolean>;

  /**
   * @returns the LsdBrowserContext associated with this page
   */
  browserContext(): LsdBrowserContext;

  /**
   * Clear the cookies of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearCookies(): Promise<boolean>;

  /**
   * Clear the localStorage of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearLocalStorage(): Promise<boolean>;

  /**
   * Clear all request interceptions on the page
   */
  clearRequestInterceptions(): Promise<boolean>;
  /**
   * Clear all response interceptions on the page
   */
  clearResponseInterceptions(): Promise<boolean>;

  /**
   * Clear the stateData of the current page(url):
   * * stateData: cookies, localStorage, indexedDB
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearStateData(): Promise<boolean>;

  /**
   * Close this page.
   * * Only a free page can be closed!
   */
  close(): Promise<boolean>;

  /**
   * Get the full HTML content of the page or a descendant frame
   * @param iframeOptions default [], selectors of descendant frames
   */
  content(iframeOptions?: IframeOption[]): Promise<string>;

  cookies(): Promise<CookieItem[]>;

  /**
   * Page evaluate.
   * NOTE(review): the method name is spelled `evalute` (not `evaluate`) in this interface; it is part of the public API as declared.
   * @param fun function to evaluate
   * @param args optional arguments for fun
   */
  evalute(fun: Function, args?: any[]): Promise<any>;

  /**
   * @returns the first element matching the given CSS selector or XPath, or null if there is none
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select a descendant frame
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement | null>;

  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select a descendant frame
   */

  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement[]>;

  /**
   * Free a busy page. All request and response interceptions will be cleared.
   */
  free(): Promise<boolean>;

  /**
   * Open (navigate to) the given url in this page
   * @param url the url to open
   * @param options optional navigation options, see GotoOptions
   * @returns NOTE(review): presumably true on success; confirm against the implementation
   */
  goto(url: string, options?: GotoOptions): Promise<boolean>;

  id(): string;

  /**
   * @returns whether this page is free (see use() and free())
   */
  isFree(): boolean;

  /**
   * Valid only in CheerioPage
   * @param html content to load
   * @param isHtml default true
   */
  load(html: string, isHtml?: boolean): boolean;

  /**
   * NOTE(review): the method name is spelled `localStroage` (not `localStorage`) in this interface; it is part of the public API as declared.
   */
  localStroage(): Promise<LocalStorageOrigin[]>;

  mainFrame(): AllFrame;

  maximizeViewport(): Promise<boolean>;

  pageHeight(): Promise<number>;

  pageInfo(): PageInfo;

  pageWidth(): Promise<number>;

  pdf(options?: PDFOptions): Promise<Buffer>;

  screenshot(options?: ScreenshotOptions): Promise<Buffer>;

  scrollBy(x: number, y: number): Promise<boolean>;

  scrollTo(x: number, y: number): Promise<boolean>;

  /**
   * Send a CDP message over the current (not detached) or a new CDP session
   * @param method protocol method name
   * @param params default null(ignored), method parameters
   * @param detach default true, whether to detach the CDPSession from target
   */
  sendCDPMessage(method: string, params?: object | null, detach?: boolean): Promise<any>;

  setCookies(cookies: CookieItem[]): Promise<boolean>;

  setExtraHTTPHeaders(headers: Record<string, string>): Promise<boolean>;

  /**
   * Set localStorage on the current web page(page.url())
   * NOTE(review): the method name is spelled `setLocalStroage` in this interface; it is part of the public API as declared.
   * @param localStorageItems items to set
   */
  setLocalStroage(localStorageItems: LocalStorageItem[]): Promise<boolean>;

  setPageInfo(pageInfo: UpdatablePageInfo): boolean;

  /**
   * Intercept requests that meet the conditions(requestMatch) to perform an action(action and fulfill).
   * @param options a single interception option or an array of them
   */
  setRequestInterception(options: RequestInterceptionOption | RequestInterceptionOption[]): Promise<boolean>;
  /**
   * Intercept responses that meet the conditions(requestMatch and responseMatch) to perform actions(cacheArray and handler)
   * @param options a single interception option or an array of them
   */
  setResponseInterception(options: ResponseInterceptionOption | ResponseInterceptionOption[]): Promise<boolean>;

  /**
   * Shortcut for LsdPage.browserContext().setStateData(stateData)
   * @param stateData state data to set
   */
  setStateData(stateData: BrowserStateData): Promise<boolean>;

  /**
   * Valid only in puppeteer
   * @param userAgent user agent string to set
   */
  setUserAgent(userAgent: string): Promise<boolean>;

  setViewportSize(viewPortSize: ViewportSize): Promise<boolean>;

  stateData(): Promise<BrowserStateData>;

  status(): PageStatus;

  title(): Promise<string>;

  url(): string;

  /**
   * Start to use this free page
   */
  use(): boolean;

  /**
   * Wait for an element selected by a CSS selector
   * @param selector CSS selector, not XPath
   * @param options optional wait options, see WaitElementOptions
   */
  waitForElement(selector: string, options?: WaitElementOptions): Promise<boolean>;

  /**
   * Wait for a navigation
   * @param options wait options, see WaitNavigationOptions (required)
   */
  waitForNavigation(options: WaitNavigationOptions): Promise<boolean>;

  /**
   * Read a (nested) member of window: obj=window?.[key1]...?.[keyn]
   * @returns obj ? JSON.stringify(obj) : ""
   * @param keys the chain of keys [key1, ..., keyn]
   */
  windowMember(keys: string[]): Promise<string>;

  /**
   * @returns the original page object of the underlying controller (typed AllPage)
   */
  _origPage(): AllPage;
}

LsdElement

/**
 * Controller-independent element interface. The first group of methods extracts data
 * from the element; the second group operates the element and is only valid for browsers.
 */
export interface LsdElement {
  ///////////////////////////////////////////////////////////////////////////////    methods used to extract data from the element
  /**
   * @returns the value of a specified attribute on the element
   * @param attributeName name of the attribute to read
   */
  attribute(attributeName: string): Promise<string>;
  /**
   * @returns the attribute names of the element
   */
  attributeNames(): Promise<string[]>;
  /**
   * @returns the first element matching the given CSS selector or XPath, or null if there is none
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select a descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement | null>;
  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select a descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement[]>;
  /**
   * @returns whether the element has the specified attribute or not
   * @param attributeName name of the attribute to check
   */
  hasAttribute(attributeName: string): Promise<boolean>;
  /**
   * @returns the HTML or XML markup contained within the element
   */
  innerHtml(): Promise<string>;

  /**
   * @returns innerText of element
   * @param onlyChild default false, whether to include only the text of the child text nodes
   */
  innerText(onlyChild?: boolean): Promise<string>;
  /**
   * @returns the serialized HTML fragment describing the element including its descendants
   */
  outerHtml(): Promise<string>;
  textContent(): Promise<string>;

  ///////////////////////////////////////////////////////////////////////////////    methods to operate the element(only valid for browser)
  /**
   * Click this element.
   * @param options default {button: "left", count: 1, delay: 0, modifies: []}
   */
  click(options?: MouseClickOptions): Promise<boolean>;
  focus(): Promise<boolean>;
  hover(): Promise<boolean>;
  /**
   * Input a value into this element, using the underlying controller's method:
   * * playwright: fill
   * * puppeteer: type
   */
  input(value: string, options?: InputOptions): Promise<boolean>;
  // NOTE(review): options is required here, unlike the optional options of click()/input() — confirm this is intended
  press(key: KeyInput, options: KeyPressOptions): Promise<boolean>;
  screenshot(options?: ScreenshotOptions): Promise<Buffer>;
  scrollIntoView(): Promise<boolean>;
  select(options: SelectOptions): Promise<boolean>;
  setAttribute(attributeName: string, newValue: string): Promise<boolean>;
  /**
   * @returns the original element object of the underlying controller (typed AllElement)
   */
  _origElement(): AllElement;
}

LsdBrowserContext

/**
 * Controller-independent browser context interface: owns pages and hands out free pages.
 */
export interface LsdBrowserContext extends EventEmitter {
  /**
   * Get the LsdApiContext associated with this LsdBrowserContext
   * * only valid in playwright
   */
  apiContext(): LsdApiContext;

  /**
   * @returns the LsdBrowser associated with this browser context
   */
  browser(): LsdBrowser;

  close(): Promise<boolean>;

  /**
   * Close pages that have been free for more than maxPageFreeSeconds if maxPageFreeSeconds > 0
   * * but the last page in the browserContext will not be closed
   * @param maxPageFreeSeconds default 0, in which case the default maxPageFreeSeconds of the browserContext will be used
   */
  closeFreePages(maxPageFreeSeconds?: number): Promise<boolean>;

  /**
   * Does this browser context meet browserContextRequirements?
   * NOTE(review): the original comment read "does this browser meet ... (incognitos ignored in browser)",
   * identical to the one on LsdBrowser — confirm how incognito is treated here.
   * @param browserContextRequirements requirements to check against
   */
  doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;

  /**
   * Get a free page from current pages or by creating a new page
   * @param always NOTE(review): undocumented in the original; confirm semantics against the implementation
   * @returns a page, or null when none can be provided
   */
  getPage(always?: boolean): Promise<LsdPage | null>;

  /**
   * Whether a number of free page(s) can be obtained
   * * refer to getPage()
   * @param pageNum default 1, the number of free pages
   */
  hasFreePage(pageNum?: number): boolean;

  id(): string;

  isIncognito(): boolean;

  page(pageIdx: number): LsdPage | null;

  pages(): LsdPage[];

  proxy(): ProxyInController | null; // reserved (spare, for future use)

  setStateData(stateData: BrowserStateData): Promise<boolean>;

  /**
   * @returns the original browser context object of the underlying controller (typed AllBrowserContext)
   */
  _origBrowserContext(): AllBrowserContext;
}

LsdBrowser

/**
 * Controller-independent browser interface; instances are obtained from
 * LsdBrowserController.launch/connect.
 */
export interface LsdBrowser extends EventEmitter {
  // By default, constructor can be called in LsdBrowserController.launch/connect to create new instance
  // main methods
  /**
   * Create a new browser context in this browser
   * @param options optional browser context options
   * @returns the new browser context, or null when it cannot be provided
   */
  newBrowserContext(options?: LsdBrowserContextOptions): Promise<LsdBrowserContext | null>;
  /**
   * 1. launched: close all browserContexts and this browser
   * 2. connected:
   * * in puppeteer:  close all browserContexts and this browser???
   * * in playwright: only browserContexts created by newContext will be closed, browser is disconnected and browser will not be closed
   */
  close(): Promise<boolean>;

  // other methods
  browserContexts(): LsdBrowserContext[];
  browserControllerType(): BrowserControllerType;
  browserCreationMethod(): BrowserCreationMethod;
  browserType(): LsdBrowserType;

  /**
   * Does this browser meet browserContextRequirements (incognitos ignored in browser)?
   * @param browserContextRequirements requirements to check against
   */
  doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;

  /**
   * @returns
   * 1. launched: actual executable path
   * 2. connected: executablePath in LsdConnectOptions, default ""(unknown)
   */
  executablePath(): string;

  id(): string;
  isConnected(): boolean;
  isHeadless(): boolean;
  options(): LsdLaunchOptions | LsdConnectOptions;
  /**
   * * puppeteer: return pid of connected or launched browser
   * * playwright: return pid of connected browser that is launched manually or using launchServer, or else return 0
   */
  pid(): number;
  /**
   * Get the cpu utilization(%) and memory usage(MB) of browser processes if pid is greater than 0 (refer to pid())
   */
  pidUsage(): Promise<{ cpu: number, memory: number }>;
  version(): Promise<string>; // playwright: sync; puppeteer: async

  /**
   * @returns the original browser object of the underlying controller (typed AllBrowser)
   */
  _origBrowser(): AllBrowser;
}

LsdBrowserController

/**
 * Entry point for creating browsers (launch/connect) and API contexts through a
 * chosen browser controller.
 */
export interface LsdBrowserController {
  /**
   * Launch a new browser using the related browser controller
   * @param browserControllerType which browser controller to use
   * @param browserType which browser to launch
   * @param options optional launch options
   */
  launch(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdLaunchOptions): Promise<LsdBrowser>;

  /**
   * Connect to the current browser using the related browser controller
   * @param browserControllerType which browser controller to use
   * @param browserType which browser to connect to
   * @param options optional connect options
   */
  connect(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdConnectOptions): Promise<LsdBrowser>;

  /**
   * Set the PuppeteerNode used for puppeteer
   * @param puppeteer  null means use puppeteer-extra-plugin-stealth based on puppeteer-extra
   */
  setPuppeteerNode(puppeteer: PuppeteerNode | null): boolean;

  /**
   * Set the playwright BrowserType used for the given browser type
   * @param browserType the browser type this setting applies to
   * @param playwrightBrowserType  null means use puppeteer-extra-plugin-stealth based on playwright-extra
   */
  setPlaywrightBrowserType(browserType: LsdBrowserType, playwrightBrowserType: BrowserType | null): boolean;

  /**
   * Create a new LsdApiContext, valid in playwright;
   * @param options optional API context options
   */
  newApiContext(options?: LsdApiContextOptions): Promise<LsdApiContext>;
}
0.0.40

1 year ago

0.0.41

12 months ago

0.0.42

12 months ago

0.0.43

11 months ago

0.0.44

11 months ago

0.0.45

10 months ago

0.0.46

10 months ago

0.0.47

10 months ago

0.0.38

1 year ago

0.0.39

1 year ago

0.0.51

10 months ago

0.0.52

9 months ago

0.0.53

9 months ago

0.0.54

9 months ago

0.0.50

10 months ago

0.0.48

10 months ago

0.0.49

10 months ago

0.0.37

1 year ago

0.0.20

2 years ago

0.0.21

2 years ago

0.0.22

2 years ago

0.0.23

2 years ago

0.0.24

2 years ago

0.0.25

2 years ago

0.0.15

2 years ago

0.0.16

2 years ago

0.0.17

2 years ago

0.0.18

2 years ago

0.0.19

2 years ago

0.0.30

2 years ago

0.0.31

2 years ago

0.0.32

2 years ago

0.0.10

2 years ago

0.0.33

2 years ago

0.0.11

2 years ago

0.0.34

2 years ago

0.0.12

2 years ago

0.0.35

2 years ago

0.0.13

2 years ago

0.0.36

2 years ago

0.0.14

2 years ago

0.0.3

2 years ago

0.0.26

2 years ago

0.0.9

2 years ago

0.0.27

2 years ago

0.0.8

2 years ago

0.0.28

2 years ago

0.0.29

2 years ago

0.0.5

2 years ago

0.0.4

2 years ago

0.0.7

2 years ago

0.0.6

2 years ago

0.0.2

2 years ago

0.0.1

2 years ago