0.8.0 • Published 5 months ago

dom-smoothie-js v0.8.0

Weekly downloads
-
License
MIT
Repository
-
Last release
5 months ago

DOM-SMOOTHIE-JS

dom-smoothie-js is a Node.js package for extracting readable content from web pages. It is a wrapper around the Rust dom_smoothie crate.

Configuration

| Parameter | Type | Default Value | Description |
| --- | --- | --- | --- |
| keep_classes | boolean | false | Keep all classes in the document |
| classes_to_preserve | Array<string> | [] | List of classes to preserve |
| max_elements_to_parse | number | 0 | Maximum number of elements to parse |
| disable_json_ld | boolean | false | Disable JSON-LD extraction |
| n_top_candidates | number | 5 | Number of top candidates to consider |
| char_threshold | number | 500 | Character threshold for content extraction |
| readable_min_score | number (float) | 20.0 | Minimum score required for readability check |
| readable_min_content_length | number | 140 | Minimum content length for readability check |
| candidate_select_mode | 'Readability' \| 'DomSmoothie' | 'Readability' | Candidate selection mode |
| text_mode | 'Raw' \| 'Formatted' \| 'Markdown' | 'Raw' | Text output mode, either raw, formatted or Markdown |

Example Object with Default Parameters

// Configuration object listing every supported option with its default value.
const config = {
  // Keep all classes in the document.
  keep_classes: false,
  // List of classes to preserve.
  classes_to_preserve: [],
  // Maximum number of elements to parse.
  max_elements_to_parse: 0,
  // Disable JSON-LD extraction.
  disable_json_ld: false,
  // Number of top candidates to consider.
  n_top_candidates: 5,
  // Character threshold for content extraction.
  char_threshold: 500,
  // Minimum score required for the readability check (float).
  readable_min_score: 20.0,
  // Minimum content length for the readability check.
  readable_min_content_length: 140,
  // Candidate selection mode: 'Readability' or 'DomSmoothie'.
  candidate_select_mode: 'Readability',
  // Text output mode: 'Raw', 'Formatted' or 'Markdown'.
  text_mode: 'Raw'
};

Examples

import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Parses a saved HTML page with a document URL and a custom config,
 * then prints the metadata fields of the extracted article.
 */
function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
  const document_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
  // Only the options that differ from the defaults need to be listed.
  const cfg = {
    classes_to_preserve: ["caption"],
  };

  // The document URL and the config are passed as the second and third arguments.
  const article = new Readability(content, document_url, cfg).parse();
  console.log("Title:", article.title);
  console.log("Byline:", article.byline);
  console.log("Length:", article.length);
  console.log("Excerpt:", article.excerpt);
  console.log("Site Name:", article.site_name);
  console.log("Dir:", article.dir);
  console.log("Published Time:", article.published_time);
  console.log("Modified Time:", article.modified_time);
  console.log("Image:", article.image);
  // This URL can only be taken from ld+json.
  console.log("URL:", article.url);

  // Skipping article.content since it is too large.
  //console.log("HTML Content:", article.content);

  // Skipping article.text_content since it is too large.
  //console.log("Text Content:", article.text_content);
}

main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Prints only the article title of a saved HTML page,
 * without extracting the full article content.
 */
function main() {
  const html = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // Metadata can be parsed on its own, without parsing the article content.
  // Neither a document URL nor a config is supplied here.
  const reader = new Readability(html, null, null);

  // Parse just the title; the full content is not extracted.
  // Note that this title may differ from `metadata.title`:
  // `metadata.title` is first looked up in the metadata and only falls back
  // to `Readability::get_article_title` when it is unavailable there.
  console.log("Title:", reader.get_article_title());
}

main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Prints the JSON-LD metadata of a saved HTML page, followed by the
 * article metadata built on top of it, without parsing the article content.
 */
function main() {
  const html = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // Metadata alone can be parsed; the article content is left untouched.
  const reader = new Readability(html, null, { disable_json_ld: false });

  // A <script type="application/ld+json"> element may hold some useful
  // information, but usually it is not enough on its own.
  const jsonLdMetadata = reader.parse_json_ld();
  console.log("LD META:", jsonLdMetadata);

  // Internally, `Readability::parse` hands the result of
  // `Readability::parse_json_ld` to `Readability::get_article_metadata`
  // as its basis. Doing so here is optional.
  const metadata = reader.get_article_metadata(jsonLdMetadata);
  console.log("META:", metadata);

  // Some Metadata fields may be missing, since they can be assigned
  // during the Readability::parse process.
  // This concerns `excerpt`, `byline`, and `dir`.
}

main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Runs a quick readability check on a saved HTML page and, only if the
 * page looks readable, parses it and prints some of the article metadata.
 */
function main() {
  const content = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // Optional parameters for `Readability.is_probably_readable`.
  const cfg = {
    readable_min_score: 20.0,
    readable_min_content_length: 140,
  };

  const readability = new Readability(content, null, cfg);

  // There is a way to perform a quick check to determine
  // if the document is readable before cleaning and parsing it.
  // After calling `Readability::parse`, it may show different results,
  // but calling it after parsing would be nonsensical.
  if (readability.is_probably_readable()) {
    // `article` is never reassigned, so it is declared with `const`.
    const article = readability.parse();
    console.log("Title:", article.title);
    console.log("Byline:", article.byline);
    console.log("Site Name:", article.site_name);
    console.log("URL:", article.url);
    // and so on...
  }
}

main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Parses a saved HTML page using the "DomSmoothie" candidate selection mode
 * and prints the extracted text content.
 */
function main() {
  const html = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // Switch candidate selection away from the default 'Readability' mode.
  const options = { candidate_select_mode: "DomSmoothie" };

  const article = new Readability(html, null, options).parse();
  console.log("Text Content:", article.text_content);
}

main();

By default, the text content is output as-is, without formatting, preserving whitespace from the original HTML document. Depending on the document's initial markup, this can be quite verbose and inconvenient.

To retrieve formatted text content, set text_mode: "Formatted" in the config. This formatting does not preserve table structures, meaning table data may be output as plain text without column alignment. While this formatting is not as structured as Markdown, it provides a cleaner output compared to raw text.

Setting text_mode: "Markdown" enables Markdown formatting.

import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";

/**
 * Parses a saved HTML page with formatted text output
 * and prints the extracted text content.
 */
function main() {
  const html = readFileSync("test_data/rustwiki_2024.html", "utf8");

  // To get Markdown output instead, use `text_mode: "Markdown"`.
  const options = { text_mode: "Formatted" };

  const article = new Readability(html, null, options).parse();
  console.log("Text Content:", article.text_content);
}

main();

License

Licensed under MIT (LICENSE or http://opensource.org/licenses/MIT).

0.8.0

5 months ago

0.7.0

5 months ago

0.6.1

6 months ago

0.6.0

6 months ago

0.5.1

6 months ago