0.0.5 • Published 1 year ago

khmertokenizer v0.0.5

Weekly downloads
-
License
MIT
Repository
github
Last release
1 year ago

Khmer Tokenizer

A fast Khmer text tokenizer that splits text into grapheme clusters while preserving every character of the input, including spaces, digits, and non-Khmer characters.

Web demo

import { tokenize } from 'khmertokenizer';

tokenize("ភាសាខ្មែរ១២ 123 ABC")
// => ["ភា","សា","ខ្មែ","រ","១","២"," ","1","2","3"," ","A","B","C"]

Iterator

import { tokenizeAsIterator } from 'khmertokenizer';

for (const c of tokenizeAsIterator("ភាសាខ្មែរ១២ 123 ABC")) {
  console.log(c);
}

Grapheme Validation

import { tokenize, isInvalidKhmerGrapheme } from 'khmertokenizer';

const input = "ភាសាខ្មែរ១២ 123 ABC ២ ៗាា"
const output = tokenize(input)
  .filter(c => !isInvalidKhmerGrapheme(c)) // remove invalid graphemes
  .join("")

//=> "ភាសាខ្មែរ១២ 123 ABC ២ ៗ"
0.0.5

1 year ago

0.0.4

1 year ago

0.0.3

1 year ago

0.0.2

1 year ago

0.0.1

1 year ago