Little-lexer NPM

Little lexer

This is a basic lexer written in JavaScript.

Installation

yarn add little-lexer

Usage

import Lexer from 'little-lexer';

const lex = Lexer();
const result = lex('if obj.prop = 2.25 then var1');

By default, every character in the string is lexed to a separate token, so the result contains one token for every character in the string:

[
	{ type: 'i', lexeme: 'i' },
	{ type: 'f', lexeme: 'f' },
	{ type: ' ', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: '.', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: ' ', lexeme: ' ' },
	{ type: '=', lexeme: '=' },
	{ type: ' ', lexeme: ' ' },
	{ type: '2', lexeme: '2' },
	{ type: '.', lexeme: '.' },
	{ type: '2', lexeme: '2' },
	{ type: '5', lexeme: '5' },
	{ type: ' ', lexeme: ' ' },
	{ type: 't', lexeme: 't' },
	{ type: 'h', lexeme: 'h' },
	{ type: 'e', lexeme: 'e' },
	{ type: 'n', lexeme: 'n' },
	{ type: ' ', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: '1', lexeme: '1' },
];

An optional object can be passed to the Lexer to specify your own token types.

import Lexer from 'little-lexer';

const matches = {
	' ': 'space',
	'=': 'equals',
};

const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');

/* [
	{ type: 'i', lexeme: 'i' },
	{ type: 'f', lexeme: 'f' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: '.', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: '2', lexeme: '2' },
	{ type: '.', lexeme: '.' },
	{ type: '2', lexeme: '2' },
	{ type: '5', lexeme: '5' },
	{ type: 'space', lexeme: ' ' },
	{ type: 't', lexeme: 't' },
	{ type: 'h', lexeme: 'h' },
	{ type: 'e', lexeme: 'e' },
	{ type: 'n', lexeme: 'n' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: '1', lexeme: '1' },
]; */

The keys can also match multiple characters:

import Lexer from 'little-lexer';

const matches = {
	if: 'keyword',
	then: 'keyword',
	' ': 'space',
	'=': 'equals',
};

const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');

/*

[
	{ type: 'keyword', lexeme: 'if' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: '.', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: '2', lexeme: '2' },
	{ type: '.', lexeme: '.' },
	{ type: '2', lexeme: '2' },
	{ type: '5', lexeme: '5' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'keyword', lexeme: 'then' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: '1', lexeme: '1' }
];

*/

The keys are used as regular expressions. So you can use regexes to specify more complex matches.

import Lexer from 'little-lexer';

const matches = {
	if: 'keyword',
	then: 'keyword',
	' ': 'space',
	'=': 'equals',
	'[0-9]': 'number',
};

const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');

/*

[
	{ type: 'keyword', lexeme: 'if' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: '.', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'number', lexeme: '2' },
	{ type: '.', lexeme: '.' },
	{ type: 'number', lexeme: '2' },
	{ type: 'number', lexeme: '5' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'keyword', lexeme: 'then' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'number', lexeme: '1' }
];

*/

The keys being regexes also means that you have to escape characters with a special regex meaning. Not escaping the . character in the next example would not yield the result we want, as the . represents any possible character in a regex. So it would always result in a match for any character instead of matching just the . character.

import Lexer from 'little-lexer';

const matches = {
	if: 'keyword',
	then: 'keyword',
	' ': 'space',
	'=': 'equals',
	'\\.': 'dot',
	'[0-9]': 'number',
};

const lex = Lexer(matches);
const result = lex('if obj.prop = 2.25 then var1');

/*

[
	{ type: 'keyword', lexeme: 'if' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: 'dot', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'number', lexeme: '2' },
	{ type: 'dot', lexeme: '.' },
	{ type: 'number', lexeme: '2' },
	{ type: 'number', lexeme: '5' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'keyword', lexeme: 'then' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'number', lexeme: '1' }
];

*/

Sometimes the lexer needs to have different states. An optional object can be passed to the Lexer function as a second argument to specify other states.

If we encounter a digit, for example, and would like the . character to be lexed as part of the floating-point number and not as a separate 'dot' token, we should specify a new state. So we introduce a new "number" state, also with a set of matches to specify state transitions. While in a custom state, the lexer only uses the matches specified in that state. In the next example, if the lexer encounters a digit 0-9 it transitions to the "number" state. While in the "number" state, only [0-9\\.] is tested and, when matched, keeps the lexer in the "number" state. If no match is found, the lexer transitions back to the starting state and a token is generated.

import Lexer from 'little-lexer';

const matches = {
	if: 'keyword',
	then: 'keyword',
	' ': 'space',
	'=': 'equals',
	'\\.': 'dot',
	'[0-9]': 'number',
};

const states = {
	number: {
		'[0-9\\.]': 'number',
	},
};

const lex = Lexer(matches, states);
const result = lex('if obj.prop = 2.25 then var1');

/*

[
	{ type: 'keyword', lexeme: 'if' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'b', lexeme: 'b' },
	{ type: 'j', lexeme: 'j' },
	{ type: 'dot', lexeme: '.' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'o', lexeme: 'o' },
	{ type: 'p', lexeme: 'p' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'number', lexeme: '2.25' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'keyword', lexeme: 'then' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'v', lexeme: 'v' },
	{ type: 'a', lexeme: 'a' },
	{ type: 'r', lexeme: 'r' },
	{ type: 'number', lexeme: '1' }
];

*/

If we want to allow lexing of identifiers that contain, but do not start with, numbers and capital letters, we can again introduce a new state for lexing identifiers. Only a lowercase letter ([a-z]) transitions to the "identifier" state. While in the "identifier" state, [a-zA-Z0-9] keeps the lexer in the "identifier" state.

import Lexer from 'little-lexer';

const matches = {
	if: 'keyword',
	then: 'keyword',
	' ': 'space',
	'=': 'equals',
	'\\.': 'dot',
	'[a-z]': 'identifier',
	'[0-9]': 'number',
};

const states = {
	number: {
		'[0-9\\.]': 'number',
	},
	identifier: {
		'[a-zA-Z0-9]': 'identifier',
	},
};

const lex = Lexer(matches, states);
const result = lex('if obj.prop = 2.25 then var1');

/*

[
	{ type: 'keyword', lexeme: 'if' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'identifier', lexeme: 'obj' },
	{ type: 'dot', lexeme: '.' },
	{ type: 'identifier', lexeme: 'prop' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'equals', lexeme: '=' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'number', lexeme: '2.25' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'keyword', lexeme: 'then' },
	{ type: 'space', lexeme: ' ' },
	{ type: 'identifier', lexeme: 'var1' }
];

*/

@everything-registry/sub-chunk-2084 @zalastax/nolb-litt @infinitebrahmanuniverse/nolb-litt

1.0.1

6 years ago

1.0.0

6 years ago