0.1.1 • Published 4 years ago

@unified-doc/hast-util-extract-text-offsets v0.1.1

Weekly downloads
-
License
MIT
Repository
github
Last release
4 years ago

hast-util-extract-text-offsets

hast utility to extract text-based offsets and unist positions of text nodes.

Install

yarn add @unified-doc/hast-util-extract-text-offsets

Description

This utility goes through all text nodes in a hast tree and extracts text offsets into a provided extractor callback.

A TextOffset contains the following information:

  • position: the unist position of the text node.
  • startOffset: the start offset of the text node's string content relative to the tree's string content.
  • endOffset: the end offset of the text node's string content (startOffset plus the length of the text) relative to the tree's string content.
  • isNewline: if the text content is a newline character.

The terms and concepts above are better presented with the following example:

const content = '<blockquote>this is a blockquote with <b>bold</b> text</blockquote>';
const tree = {
	type: 'root',
	children: [
		{
			type: 'element',
			tagName: 'blockquote',
			position: Position1,
			children: [
				{
					type: 'text',
					position: Position2,
					value: 'this is a blockquote with ',
				},
				{
					type: 'element',
					tagName: 'b',
					position: Position3,
					children: [
						{
							type: 'text',
							value: 'bold',
							position: Position4,
						},
					],
				},
				{
					type: 'text',
					value: ' text',
					position: Position5,
				},
			],
		},
	],
};
const stringifiedTree = 'this is a blockquote with bold text';

// startOffset and endOffset of text offsets are calculated against stringifiedTree
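const extractedTextOffsets = [
	{
		// 'this is a blockquote with '
		startOffset: 0,
		endOffset: 26, // 0 + 'this is a blockquote with '.length
		position: Position2,
	},
	{
		// 'bold'
		startOffset: 26, // previous text endOffset
		endOffset: 30, // 26 + 'bold'.length
		position: Position4,
	},
	{
		// ' text'
		startOffset: 30, // previous text endOffset
		endOffset: 35, // 30 + ' text'.length
		position: Position5,
	},
];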

Use

import extractTextOffsets from '@unified-doc/hast-util-extract-text-offsets';

const tree = {
	type: 'element',
	tagName: 'div',
	children: [
		{
			type: 'element',
			tagName: 'h1',
			children: [
				{
					type: 'text',
					position: 'MockUnistPosition1',
					value: 'h1',
				},
			],
		},
		{
			type: 'element',
			tagName: 'div',
			children: [
				{
					type: 'text',
					position: 'MockUnistPosition2',
					value: 'div',
				},
				{
					type: 'element',
					tagName: 'a',
					children: [
						{
							type: 'text',
							position: 'MockUnistPosition3',
							value: 'a',
						},
					],
				},
			],
		},
		{
			type: 'element',
			tagName: 'h2',
			children: [
				{
					type: 'text',
					value: 'h2',
				},
			],
		},
	],
};

function extractor(textOffsets) {
	console.log(textOffsets);
}

extractTextOffsets(tree, extractor);

Yields:

[
	{
		"startOffset": 0,
		"endOffset": 2,  // 0 + 'h1'.length
		"position": "MockUnistPosition1",
	},
	{
		"startOffset": 2,  // previous text endOffset
		"endOffset": 5,  // 2 + 'div'.length
		"position": "MockUnistPosition2",
	},
	{
		"startOffset": 5,  // previous text endOffset
		"endOffset": 6,  // 5 + 'a'.length
		"position": "MockUnistPosition3",
	}
	// Text node with value 'h2' is not extracted because it does not have a source unist position
];

API

function textOffsets(tree: Node, extractor: Extractor): Node;

Extracts and captures an array of TextOffset with an Extractor callback. Returns the original unmodified tree.

Types

import { Position } from 'unist';

interface TextOffset {
	startOffset: number;
	endOffset: number;
	position: Position;
	isNewline?: boolean;
}

type Extractor = (textOffsets: TextOffset[]) => void;

export default ({ children }) => children