0.0.54 • Published 7 months ago

@letsscrapedata/controller v0.0.54

Weekly downloads
-
License
Apache-2.0
Repository
github
Last release
7 months ago

For quick help and discussion about how to scrape a website, please use the Discord server, where responses are fast. For better tracking, please submit issues on GitHub.

Features

This package is used by @letsscrapedata/scraper to facilitate switching between different types of browser controllers and to facilitate support for the new anti-bot browser controller without modifying existing programs.

  • Same interface for playwright, puppeteer, and cheerio (more to come): easy to switch between them
  • Web browsing automation: goto(open) / click / input / hover / select / scroll
  • State data management: cookies, localStorage, HTTP Headers, custom session data
  • Request and response interception management: data and HTTP headers
  • Elements selection by CSS selectors or XPath: whether in frames or not
  • Element's attributes: innerHtml, innerText, outerHtml, textContent, etc
  • Automatic file saving: such as screenshot, pdf
  • CDP message
  • Page evaluate
  • Implements functions that are not supported by individual browser controllers, and provides workarounds for known issues

Install

npm install @letsscrapedata/controller

Examples

import { controller } from "@letsscrapedata/controller";

// Launch a Chromium browser through the playwright controller, with a visible window.
const browser = await controller.launch("playwright", "chromium", { headless: false });
try {
  // newBrowserContext() and getPage() are both declared to return `... | null`,
  // so check the results before using them.
  const browserContext = await browser.newBrowserContext();
  if (!browserContext) {
    throw new Error("failed to create a browser context");
  }
  const page = await browserContext.getPage();
  if (!page) {
    throw new Error("failed to get a page from the browser context");
  }

  await page.goto("https://www.letsscrapedata.com/pages/listexample1.html");
  await page.screenshot({ path: "screenshot.png" });
} finally {
  // Always close the browser, even if one of the steps above failed.
  await browser.close();
}

Same interfaces

  • LsdElement
  • LsdPage
  • LsdBrowserContext
  • LsdBrowser
  • LsdBrowserController

LsdPage

export interface LsdPage extends EventEmitter {
  /**
   * Get the LsdApiContext associated with this page's LsdBrowserContext
   * * only valid in playwright
   */
  apiContext(): LsdApiContext;

  bringToFront(): Promise<boolean>;

  browserContext(): LsdBrowserContext;

  /**
   * Clear the cookies of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearCookies(): Promise<boolean>;

  /**
   * Clear the localStorage of the current page(url)
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearLocalStorage(): Promise<boolean>;

  /**
   * Clear all request interceptions on the page
   */
  clearRequestInterceptions(): Promise<boolean>;
  /**
   * Clear all response interceptions on the page
   */
  clearResponseInterceptions(): Promise<boolean>;

  /**
   * Clear the stateData of the current page(url):
   * * stateData: cookies, localStorage, indexedDB
   * * Prerequisites: page must have a valid url, such as by calling goto(url)
   */
  clearStateData(): Promise<boolean>;

  /**
   * Only a free page can be closed!
   */
  close(): Promise<boolean>;

  /**
   * Get the full HTML content of the page or of a descendant frame
   * @param iframeOptions default [], selectors of descendant frames
   */
  content(iframeOptions?: IframeOption[]): Promise<string>;

  cookies(): Promise<CookieItem[]>;

  /**
   * Page evaluate: run `fun` with the optional `args`
   * * NOTE(review): presumably `fun` is executed in the page context - confirm against the implementation
   * * NOTE: the method name is spelled `evalute` in this interface
   */
  evalute(fun: Function, args?: any[]): Promise<any>;

  /**
   * @returns the first element matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement | null>;

  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   */

  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[]): Promise<LsdElement[]>;

  /**
   * Free a busy page. All request and response interceptions will be cleared.
   */
  free(): Promise<boolean>;

  /**
   * Open (go to) the given url in this page
   * @param url the url to open
   * @param options optional GotoOptions
   * @returns a boolean; presumably whether the navigation succeeded - TODO confirm
   */
  goto(url: string, options?: GotoOptions): Promise<boolean>;

  id(): string;

  isFree(): boolean;

  /**
   * valid only in CheerioPage
   * @param html
   * @param isHtml default true
   */
  load(html: string, isHtml?: boolean): boolean;

  /**
   * NOTE: the method name is spelled `localStroage` in this interface
   */
  localStroage(): Promise<LocalStorageOrigin[]>;

  mainFrame(): AllFrame;

  maximizeViewport(): Promise<boolean>;

  pageHeight(): Promise<number>;

  pageInfo(): PageInfo;

  pageWidth(): Promise<number>;

  pdf(options?: PDFOptions): Promise<Buffer>;

  screenshot(options?: ScreenshotOptions): Promise<Buffer>;

  scrollBy(x: number, y: number): Promise<boolean>;

  scrollTo(x: number, y: number): Promise<boolean>;

  /**
   * Send a CDP message over the current (not detached) or a new CDP session
   * @param method protocol method name
   * @param params default null(ignored), method parameters
   * @param detach default true, whether to detach the CDPSession from target
   */
  sendCDPMessage(method: string, params?: object | null, detach?: boolean): Promise<any>;

  setCookies(cookies: CookieItem[]): Promise<boolean>;

  setExtraHTTPHeaders(headers: Record<string, string>): Promise<boolean>;

  /**
   * Set localStorage on the current web page(page.url())
   * * NOTE: the method name is spelled `setLocalStroage` in this interface
   * @param localStorageItems
   */
  setLocalStroage(localStorageItems: LocalStorageItem[]): Promise<boolean>;

  setPageInfo(pageInfo: UpdatablePageInfo): boolean;

  /**
   * Intercept requests that meet the conditions(requestMatch) to perform an action(action and fulfill).
   * @param options
   */
  setRequestInterception(options: RequestInterceptionOption | RequestInterceptionOption[]): Promise<boolean>;
  /**
   * Intercept responses that meet the conditions(requestMatch and responseMatch) to perform actions(cacheArray and handler)
   * @param options
   */
  setResponseInterception(options: ResponseInterceptionOption | ResponseInterceptionOption[]): Promise<boolean>;

  /**
   * Shortcut for LsdPage.browserContext().setStateData(stateData)
   * @param stateData
   */
  setStateData(stateData: BrowserStateData): Promise<boolean>;

  /**
   * valid only in puppeteer
   * @param userAgent
   */
  setUserAgent(userAgent: string): Promise<boolean>;

  setViewportSize(viewPortSize: ViewportSize): Promise<boolean>;

  stateData(): Promise<BrowserStateData>;

  status(): PageStatus;

  title(): Promise<string>;

  url(): string;

  /**
   * Start to use this free page
   */
  use(): boolean;

  /**
   * @param selector CSS selector, not XPath
   * @param options
   */
  waitForElement(selector: string, options?: WaitElementOptions): Promise<boolean>;

  /**
   * @param options
   */
  waitForNavigation(options: WaitNavigationOptions): Promise<boolean>;

  /**
   * obj=window?.[key1]...?.[keyn]
   * @return obj ? JSON.stringify(obj) : ""
   * @param keys
   */
  windowMember(keys: string[]): Promise<string>;

  _origPage(): AllPage;
}

LsdElement

export interface LsdElement {
  ///////////////////////////////////////////////////////////////////////////////    methods used to extract data from the element
  /**
   * @return the value of a specified attribute on the element
   * @param attributeName
   */
  attribute(attributeName: string): Promise<string>;
  /**
   * @returns the attribute names of the element
   */
  attributeNames(): Promise<string[]>;
  /**
   * @returns the first element matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElement(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement | null>;
  /**
   * @returns elements matching the given CSS selector or XPath
   * @param selectorOrXpath CSS selector or XPath; if this parameter is an array, each selectorOrXpath in the array will be tried until elements are selected
   * @param iframeOptions default [], options to select descendant frame
   * @param absolute valid only if iframeOptions.length===0
   */
  findElements(selectorOrXpath: string | string[], iframeOptions?: IframeOption[], absolute?: boolean): Promise<LsdElement[]>;
  /**
   * @returns whether the element has the specified attribute or not
   * @param attributeName
   */
  hasAttribute(attributeName: string): Promise<boolean>;
  /**
   * @returns the HTML or XML markup contained within the element
   */
  innerHtml(): Promise<string>;

  /**
   * @returns innerText of element
   * @param onlyChild default false, whether to include only the text of the child text nodes
   */
  innerText(onlyChild?: boolean): Promise<string>;
  /**
   * @returns the serialized HTML fragment describing the element including its descendants
   */
  outerHtml(): Promise<string>;
  /**
   * @returns textContent of element
   */
  textContent(): Promise<string>;

  ///////////////////////////////////////////////////////////////////////////////    methods to operate the element(only valid for browser)
  /**
   * Click this element.
   * @param options default {button: "left", count: 1, delay: 0, modifies: []}
   */
  click(options?: MouseClickOptions): Promise<boolean>;
  focus(): Promise<boolean>;
  hover(): Promise<boolean>;
  /**
   * Input a value into this element, implemented with:
   * * playwright: fill
   * * puppeteer: type
   */
  input(value: string, options?: InputOptions): Promise<boolean>;
  press(key: KeyInput, options: KeyPressOptions): Promise<boolean>;
  screenshot(options?: ScreenshotOptions): Promise<Buffer>;
  scrollIntoView(): Promise<boolean>;
  select(options: SelectOptions): Promise<boolean>;
  setAttribute(attributeName: string, newValue: string): Promise<boolean>;
  _origElement(): AllElement;
}

LsdBrowserContext

export interface LsdBrowserContext extends EventEmitter {
  /**
   * Get the LsdApiContext associated with this LsdBrowserContext
   * * only valid in playwright
   */
  apiContext(): LsdApiContext;

  browser(): LsdBrowser;

  close(): Promise<boolean>;

  /**
   * Close pages that have been free for more than maxPageFreeSeconds if maxPageFreeSeconds > 0
   * * but the last page in the browserContext will not be closed
   * @default 0 the default maxPageFreeSeconds of the browserContext will be used
   */
  closeFreePages(maxPageFreeSeconds?: number): Promise<boolean>;

  /**
   * Does this browser meet browserContextRequirements (incognitos ignored in browser)?
   * * NOTE(review): the wording is identical to LsdBrowser.doesMeetBrowserContextRequirements;
   *   confirm whether "this browser" should read "this browserContext" here
   * @param browserContextRequirements
   */
  doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;

  /**
   * Get a free page from current pages or by creating a new page
   */
  getPage(always?: boolean): Promise<LsdPage | null>;

  /**
   * Whether a number of free page(s) can be obtained
   * * refer to getPage()
   * @param pageNum default 1, the number of free pages
   */
  hasFreePage(pageNum?: number): boolean;

  id(): string;

  isIncognito(): boolean;

  page(pageIdx: number): LsdPage | null;

  pages(): LsdPage[];

  proxy(): ProxyInController | null; // reserved (standby)

  setStateData(stateData: BrowserStateData): Promise<boolean>;

  _origBrowserContext(): AllBrowserContext;
}

LsdBrowser

export interface LsdBrowser extends EventEmitter {
  // By default, constructor can be called in LsdBrowserController.launch/connect to create new instance
  // main methods
  newBrowserContext(options?: LsdBrowserContextOptions): Promise<LsdBrowserContext | null>;
  /**
   * 1. launched: close all browserContexts and this browser
   * 2. connected:
   * * in puppeteer:  close all browserContexts and this browser???
   * * in playwright: only browserContexts created by newContext will be closed, browser is disconnected and browser will not be closed
   */
  close(): Promise<boolean>;

  // other methods
  browserContexts(): LsdBrowserContext[];
  browserControllerType(): BrowserControllerType;
  browserCreationMethod(): BrowserCreationMethod;
  browserType(): LsdBrowserType;

  /**
   * Does this browser meet browserContextRequirements (incognitos ignored in browser)?
   * @param browserContextRequirements
   */
  doesMeetBrowserContextRequirements(browserContextRequirements: BrowserContextRequirements): boolean;

  /**
   * @returns
   * 1. launched: actual executable path
   * 2. connected: exectuablePath in LsdConnectOptions, default ""(unknown)
   */
  executablePath(): string;

  id(): string;
  isConnected(): boolean;
  isHeadless(): boolean;
  options(): LsdLaunchOptions | LsdConnectOptions;
  /**
   * * puppeteer: return pid of connected or launched browser
   * * playwright: return pid of connected browser that is launched manually or using launchServer, or else return 0
   */
  pid(): number;
  /**
   * Get the CPU utilization(%) and memory usage(MB) of browser processes if pid is greater than 0 (refer to pid())
   */
  pidUsage(): Promise<{ cpu: number, memory: number }>;
  version(): Promise<string>; // playwright: sync; puppeteer: async

  _origBrowser(): AllBrowser;
}

LsdBrowserController

export interface LsdBrowserController {
  /**
   * Launch a new browser using related browser controller
   * @param browserControllerType
   * @param browserType
   * @param options
   */
  launch(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdLaunchOptions): Promise<LsdBrowser>;

  /**
   * Connect to the current browser using related browser controller
   * @param browserControllerType
   * @param browserType
   * @param options
   */
  connect(browserControllerType: BrowserControllerType, browserType: LsdBrowserType, options?: LsdConnectOptions): Promise<LsdBrowser>;

  /**
   * @param puppeteer  null means use puppeteer-extra-plugin-stealth based on puppeteer-extra
   */
  setPuppeteerNode(puppeteer: PuppeteerNode | null): boolean;

  /**
   * @param browserType
   * @param playwrightBrowserType  null means use puppeteer-extra-plugin-stealth based on playwright-extra
   */
  setPlaywrightBrowserType(browserType: LsdBrowserType, playwrightBrowserType: BrowserType | null): boolean;

  /**
   * Create a new LsdApiContext, valid in playwright;
   */
  newApiContext(options?: LsdApiContextOptions): Promise<LsdApiContext>;
}
0.0.40

11 months ago

0.0.41

10 months ago

0.0.42

9 months ago

0.0.43

9 months ago

0.0.44

9 months ago

0.0.45

8 months ago

0.0.46

8 months ago

0.0.47

8 months ago

0.0.38

1 year ago

0.0.39

1 year ago

0.0.51

8 months ago

0.0.52

7 months ago

0.0.53

7 months ago

0.0.54

7 months ago

0.0.50

8 months ago

0.0.48

8 months ago

0.0.49

8 months ago

0.0.37

1 year ago

0.0.20

2 years ago

0.0.21

2 years ago

0.0.22

2 years ago

0.0.23

2 years ago

0.0.24

2 years ago

0.0.25

2 years ago

0.0.15

2 years ago

0.0.16

2 years ago

0.0.17

2 years ago

0.0.18

2 years ago

0.0.19

2 years ago

0.0.30

1 year ago

0.0.31

1 year ago

0.0.32

1 year ago

0.0.10

2 years ago

0.0.33

1 year ago

0.0.11

2 years ago

0.0.34

1 year ago

0.0.12

2 years ago

0.0.35

1 year ago

0.0.13

2 years ago

0.0.36

1 year ago

0.0.14

2 years ago

0.0.3

2 years ago

0.0.26

2 years ago

0.0.9

2 years ago

0.0.27

2 years ago

0.0.8

2 years ago

0.0.28

2 years ago

0.0.29

2 years ago

0.0.5

2 years ago

0.0.4

2 years ago

0.0.7

2 years ago

0.0.6

2 years ago

0.0.2

2 years ago

0.0.1

2 years ago