1.0.4 • Published 7 years ago

tnthai v1.0.4

Weekly downloads
5
License
MIT
Repository
gitlab
Last release
7 years ago

TNThai or TN-Thai Analyzer

TN-Thai analyzer is a Thai word segmentation module for Node.js. The segmentation algorithm is a form of dictionary-based segmentation. Internally, the analyzer contains two segmentation algorithms, named Safe and Unsafe segmentation (documentation is currently in Thai; an English version is coming soon). The library uses a Trie data structure with a Double-array implementation to store Thai words. The Thai word dictionary comes from Lexitron (NECTEC) and the Swath program.

Installation

npm install tnthai

or

npm install tnthai --save

The basic usage

tnthai = require('tnthai')
var analyzer = new tnthai()

analyzer.segmenting("สวัสดีชาวโลก")
// { solution : ['สวัสดี', 'ชาวโลก'] }

analyzer.segmenting("สองสาวสุดแสนสวยใส่เสื้อสีแสดสวมสร้อยสี่แสนสามสิบเส้นส้นสูง")
// { solution: [ 'สอง', 'สาว', 'สุด', 'แสน', 'สวย', 'ใส่', 'เสื้อ', 'สี',
//               'แสด', 'สวม', 'สร้อย', 'สี่', 'แสน', 'สาม', 'สิบ', 'เส้น', 'ส้นสูง' ] }

Filter stopword in the segmented result

analyzer.segmenting("เราคนหนึ่งคนนั้น ในวันหนึ่งวันนั้นเรายังผูกพันกันมากมาย"
, {filterStopword : true})
// {solution: [ 'คน', 'คน', ' ', 'วันหนึ่ง', 'ผูกพัน', 'มากมาย' ]}

Analyze mixed Thai and English input (handling of English is basic)

analyzer.segmenting("สวัสดีชาวโลก Hello World!!")
// {solution: [ 'สวัสดี', 'ชาวโลก', ' ', 'Hello', ' ', 'World', '!!' ]}

Return multiple candidate solutions for a segmentation

analyzer.segmenting("คนแก่ขนของ", {multiSolution : true})
// { solution: 
//   [ [ 'คนแก่', 'ขนของ' ],
//     [ 'คนแก่', 'ขน', 'ของ' ],
//     [ 'คน', 'แก่', 'ขนของ' ],
//     [ 'คน', 'แก่', 'ขน', 'ของ' ] ] }

Unsafe segmentation, used when a misspelling occurs in the input sentence

//misspell input
analyzer.segmenting("คนแก่สขนของ", {multiSolution : true})
// { solution: [ [ 'คนแก่', 'ส', 'ขนของ' ] ] }

Applications of Thai word segmentation: 1. analyzing a sentence for keywords to query in databases (TNThaiAnalyzer); 2. language modeling — generating a word list based on the documents in a database, for use in spell correction.

gitlab url : https://gitlab.thinknet.co.th/prapeepat/TNThaiAnalyzer

Upcoming features: in version 1.1.0 there will be a POS (part-of-speech) tagging feature using a probabilistic N-gram model trained on the Orchid corpus. Example usage will be as follows:

analyzer.segmenting("คนแก่ขนของ", {multiSolution : true, POSTagging : true})
// { solution: 
//   [ [ {'คนแก่', 'NPRP'}, {'ขนของ', 'VACT'} ],
//     [ {'คนแก่', 'NPRP'}, {'ขน', 'VACT'}, {'ของ', 'NCMN'} ],
//     [ {'คน', 'NCMN'}, {'แก่', 'VATT'}, {'ขนของ', 'VACT'} ],
//     [ {'คน', 'NCMN'}, {'แก่', 'VATT'}, {'ขน', 'VACT'}, {'ของ', 'NCMN'} ] ] }
//      NPRP ~ Proper noun , VACT ~ Active verb, NCMN ~ Common noun, VATT ~ Attributive verb 

The details of the POS tagging tagset can be found here.

Feedback is welcome!

1.0.4

7 years ago

1.0.3

7 years ago

1.0.2

8 years ago

1.0.1

8 years ago

1.0.0

8 years ago

0.9.13

8 years ago

0.9.12

8 years ago

0.9.11

8 years ago

0.9.10

8 years ago

0.9.9

8 years ago

0.9.8

8 years ago

0.9.7

8 years ago

0.9.6

8 years ago

0.9.5

8 years ago

0.9.4

8 years ago

0.9.3

8 years ago

0.9.2

8 years ago

0.9.1

8 years ago

0.9.0

8 years ago