dom-smoothie-js v0.8.0
DOM-SMOOTHIE-JS
dom-smoothie-js
is a nodejs package for extracting readable content from web pages. It is a wrapper around the rust dom_smoothie crate.
Configuration
Parameter | Type | Default Value | Description |
---|---|---|---|
keep_classes | boolean | false | Keep all classes in the document |
classes_to_preserve | Array<string> | [] | List of classes to preserve |
max_elements_to_parse | number | 0 | Maximum number of elements to parse |
disable_json_ld | boolean | false | Disable JSON-LD extraction |
n_top_candidates | number | 5 | Number of top candidates to consider |
char_threshold | number | 500 | Character threshold for content extraction |
readable_min_score | number (float) | 20.0 | Minimum score required for readability check |
readable_min_content_length | number | 140 | Minimum content length for readability check |
candidate_select_mode | 'Readability' \| 'DomSmoothie' | 'Readability' | Candidate selection mode |
text_mode | 'Raw' \| 'Formatted' \| 'Markdown' | 'Raw' | Text output mode, either raw, formatted or Markdown |
Example Object with Default Parameters
const config = {
keep_classes: false,
classes_to_preserve: [],
max_elements_to_parse: 0,
disable_json_ld: false,
n_top_candidates: 5,
char_threshold: 500,
readable_min_score: 20.0,
readable_min_content_length: 140,
candidate_select_mode: 'Readability',
text_mode: 'Raw'
};
Examples
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
const document_url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
const cfg = {
classes_to_preserve: ["caption"],
}
// document_url and cfg
const article = new Readability(content, document_url, cfg).parse();
console.log("Title:", article.title);
console.log("Byline:", article.byline);
console.log("Length:", article.length);
console.log("Excerpt:", article.excerpt);
console.log("Site Name:", article.site_name);
console.log("Dir:", article.dir);
console.log("Published Time:", article.published_time);
console.log("Modified Time:", article.modified_time);
console.log("Image:", article.image);
// This uri can be taken only from ld+json
console.log("URL:", article.url);
// Skipping article.content since it is too large.
//console.log("HTML Content:", article.content);
// Skipping article.text_content since it is too large.
//console.log("Text Content:", article.text_content);
}
main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
// You can parse only the metadata without parsing the article content.
const readability = new Readability(content, null, null);
// Parse only the title without extracting the full content.
const title = readability.get_article_title();
console.log("Title:", title);
// However, this title may differ from `metadata.title`,
// as `metadata.title` first attempts to extract the title from the metadata
// and falls back to `Readability::get_article_title` if unavailable.
}
main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
const cfg = {
disable_json_ld: false,
};
// You can parse only metadata without parsing the article content
const readability = new Readability(content, null, cfg);
// <script type="application/ld+json"> may contain some useful information,
// but usually it is not enough.
const ld_meta = readability.parse_json_ld();
console.log("LD META:", ld_meta);
// Under the hood, `Readability::parse` passes the metadata obtained from `Readability::parse_json_ld`
// as the basis to `Readability::get_article_metadata`. But this is not necessary.
const meta = readability.get_article_metadata(ld_meta);
console.log("META:", meta);
// Some fields of Metadata may be missing because they can be assigned
// during the Readability::parse process.
// This applies to `excerpt`, `byline`, and `dir`.
}
main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
// you can specify optional parameters for `Readability.is_probably_readable`.
const cfg = {
readable_min_score: 20.0,
readable_min_content_length: 140,
};
const readability = new Readability(content, null, cfg);
// There is a way to perform a quick check to determine
// if the document is readable before cleaning and parsing it.
// After calling `Readability::parse`, it may show different results,
// but calling it after parsing would be nonsensical.
if (readability.is_probably_readable()) {
let article = readability.parse();
console.log("Title:", article.title);
console.log("Byline:", article.byline);
console.log("Site Name:", article.site_name);
console.log("URL:", article.url);
// and so on...
}
}
main();
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
const cfg = {
candidate_select_mode: "DomSmoothie",
};
const readability = new Readability(content, null, cfg);
const article = readability.parse();
console.log("Text Content:", article.text_content);
}
main();
By default, the text content is output as-is, without formatting, preserving whitespace from the original HTML document. Depending on the document's initial markup, this can be quite verbose and inconvenient.
To retrieve formatted text content, set text_mode: TextMode::Formatted
in the config.
This formatting does not preserve table structures, meaning table data may be output as plain text without column alignment.
While this formatting is not as structured as Markdown, it provides a cleaner output compared to raw text.
TextMode::Markdown
enables Markdown formatting.
import { Readability } from "dom-smoothie-js";
import { readFileSync } from "node:fs";
function main() {
const content = readFileSync("test_data/rustwiki_2024.html", "utf8");
const cfg = {
text_mode: "Formatted",
//text_mode: "Markdown",
};
const readability = new Readability(content, null, cfg);
const article = readability.parse();
console.log("Text Content:", article.text_content);
}
main();
License
Licensed under MIT (LICENSE or http://opensource.org/licenses/MIT).