-
It took me three days to implement the g4 grammar using Chevrotain, but the performance turned out to be significantly worse than before. Antlr4antlr4grammar Formula;
// antlr的分析器定义文件
//////////////////// parser rules:
// The entry point of the grammar
formula
: expr
;
// Defines expressions based on operator precedence similar to JavaScript
// 运算符优先级按照JS的风格来,具体参考(https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Operators/Operator_Precedence)
expr
: FIELD # field // 字段 (e.g., #field.name#)
| INT # int // 整数 (e.g., 123)
| DOUBLE # double // 浮点数 (e.g., 123.45)
| String # str // 字符串 (e.g., "text")
| op=(TRUE | FALSE) # bool // 布尔值 (true/false)
| NULL # null // 空值 (null)
| LPAREN expr RPAREN # parens // 括号 (e.g., (a + b))
| ID LPAREN (expr (COMMA expr)*)? RPAREN # func // 函数调用 (e.g., SUM(a, b))
| '[' (expr (COMMA expr)*)? ']' # list // 数组 (e.g., [1, 2, 3])
| op=MINUS expr # unaryOperator // 一元运算符:负号(-)
| expr op=(MULTIPLY | DIVIDE) expr # mulDiv // 乘除法 (e.g., a * b or a / b)
| expr op=(PLUS | MINUS) expr # plusMinus // 加减法 (e.g., a + b or a - b)
| expr op=(EQ | NEQ | LT | LE | GT | GE) expr # compare // 比较符 (e.g., a > b, a == b)
| expr expr # error // 错误 (invalid expressions)
;
//////////////////// lexer rules:
// Arithmetic and comparison operators
PLUS : '+'; // Addition
MINUS : '-'; // Subtraction
MULTIPLY : '*'; // Multiplication
DIVIDE : '/'; // Division
// Parentheses for grouping expressions
LPAREN : '(' | '('; // Left parenthesis (including full-width version)
RPAREN : ')' | ')'; // Right parenthesis (including full-width version)
// Comparison operators
EQ : '=='; // Equal
NEQ : '!='; // Not equal
LT : '<'; // Less than
LE : '<='; // Less than or equal to
GT : '>'; // Greater than
GE : '>='; // Greater than or equal to
// Boolean and null literals
FALSE : 'FALSE' | 'false'; // Boolean false
TRUE : 'TRUE' | 'true'; // Boolean true
NULL : 'NULL' | 'null'; // Null value
// Comma as a separator (supports both half-width and full-width)
COMMA : ',' | ','; // Comma separator
// Identifier rule for variables and fields
ID
: [$_A-Za-z\u4e00-\u9fa5][_A-Za-z0-9\u4e00-\u9fa5]* | '[' [0-9*]+ ']'
; // Identifiers can include letters, numbers, and underscores
// Field rule, supporting both hashed and nested formats
FIELD
: '#' ID ('.' ID)* '#' | ID ('.' ID)*
; // e.g., #field.name# or field.name
// Integer literals
INT
: '0' | [1-9][0-9]*
; // e.g., 0, 123, 456
// Floating-point literals
DOUBLE
: ('0' | [1-9][0-9]*)('.'[0-9]*)?
; // e.g., 0.1, 123.45
// Whitespace, ignored during parsing
WS
: [ \t\r\n]+ -> skip
; // Skips spaces, tabs, and newlines
// String literals (supports both double quotes and Chinese quotation marks)
String
: '"' (StringCharacter+)? '"' | '“' (StringCharacter+)? '”'
; // e.g., "text", “text”
// Allowed characters within strings
StringCharacter
: ~["“”\\] | EscapeSequence
; // Any character except quotes or backslashes, or an escape sequence
// Escape sequences for strings
EscapeSequence
: '\\' [btnfr"“”'\\] | UnicodeEscape
; // e.g., \n, \t, \" or Unicode sequences
// Unicode escape sequence
UnicodeEscape
: '\\' 'u' HexDigit HexDigit HexDigit HexDigit
; // e.g., \u1234
// Hexadecimal digit
HexDigit
: [0-9a-fA-F]
; // Hexadecimal characters (0-9, A-F)Chevrontain v10.5.0chevrotain
import { createToken, Lexer } from 'chevrotain';
/**
* -----------------------------------------
* Arithmetic and comparison operators
* -----------------------------------------
*/
export const PLUS_MINUS_OPS = createToken({
name: 'PLUS_MINUS_OPS',
pattern: Lexer.NA,
});
// Plus sign, e.g., 1 + 2
export const PLUS = createToken({ name: 'PLUS', pattern: /\+/, categories: PLUS_MINUS_OPS }); // Addition
// Minus sign, e.g., 1 - 2
export const MINUS = createToken({ name: 'MINUS', pattern: /-/, categories: PLUS_MINUS_OPS }); // Subtraction
export const MUL_DIV_OPS = createToken({
name: 'MUL_DIV_OPS',
pattern: Lexer.NA,
});
// Multiplication sign, e.g., 1 * 2
export const MULTIPLY = createToken({ name: 'MULTIPLY', pattern: /\*/, categories: MUL_DIV_OPS }); // Multiplication
// Division sign, e.g., 1 / 2
export const DIVIDE = createToken({ name: 'DIVIDE', pattern: /\//, categories: MUL_DIV_OPS }); // Division
/**
* -----------------------------------------
* Parentheses for grouping expressions
* (includes full-width versions)
* -----------------------------------------
*/
// Left parenthesis '(' or '('
export const LPAREN = createToken({ name: 'LPAREN', pattern: /\(|(/ }); // Left parenthesis
// Right parenthesis ')' or ')'
export const RPAREN = createToken({ name: 'RPAREN', pattern: /\)|)/ }); // Right parenthesis
/**
* -----------------------------------------
* Brackets for indexing or grouping
* -----------------------------------------
*/
// Left square bracket '['
export const LBRACKET = createToken({ name: 'LBRACKET', pattern: /\[/ }); // Left bracket
// Right square bracket ']'
export const RBRACKET = createToken({ name: 'RBRACKET', pattern: /\]/ }); // Right bracket
/**
* -----------------------------------------
* Comparison operators
* -----------------------------------------
*/
export const COMPARE_OPS = createToken({
name: 'COMPARE_OPS',
pattern: Lexer.NA,
});
// Less than or equal '<='
export const LE = createToken({ name: 'LE', pattern: /<=/, line_breaks: false, categories: COMPARE_OPS }); // Less Than or Equal
// Greater than or equal '>='
export const GE = createToken({ name: 'GE', pattern: />=/, line_breaks: false, categories: COMPARE_OPS }); // Greater Than or Equal
// Less than '<'
export const LT = createToken({ name: 'LT', pattern: /</, line_breaks: false, categories: COMPARE_OPS }); // Less Than
// Greater than '>'
export const GT = createToken({ name: 'GT', pattern: />/, line_breaks: false, categories: COMPARE_OPS }); // Greater Than
// Equal '=='
export const EQ = createToken({ name: 'EQ', pattern: /==/, line_breaks: false, categories: COMPARE_OPS }); // Equal
// Not equal '!='
export const NEQ = createToken({ name: 'NEQ', pattern: /!=/, line_breaks: false, categories: COMPARE_OPS }); // Not Equal
/**
* -----------------------------------------
* Boolean and null literals
* -----------------------------------------
*/
// Boolean true, matches 'TRUE' or 'true'
export const TRUE = createToken({ name: 'TRUE', pattern: /TRUE|true/, line_breaks: false }); // Boolean true
// Boolean false, matches 'FALSE' or 'false'
export const FALSE = createToken({ name: 'FALSE', pattern: /FALSE|false/, line_breaks: false }); // Boolean false
// Null value, matches 'NULL' or 'null'
export const NULL = createToken({ name: 'NULL', pattern: /NULL|null/, line_breaks: false }); // Null value
/**
* -----------------------------------------
* Comma as a separator (supports both half-width and full-width)
* -----------------------------------------
*/
// Comma separator, matches ',' or ','
export const COMMA = createToken({ name: 'COMMA', pattern: /,|,/ }); // Comma separator
/**
* -----------------------------------------
* Identifier rule for variables and fields
* -----------------------------------------
*/
// Define DOT token for matching '.' character
export const DOT = createToken({ name: 'DOT', pattern: /\./ });
// Define a regular expression for identifiers (ID_PATTERN)
// 1) Normal form: First character can be $_A-Za-z or Chinese, followed by $_A-Za-z0-9 or Chinese
// 2) Bracket form: [1], [2], [*], etc.
export const ID_PATTERN =
'(?:' + // Non-capturing group to handle two cases
// 1. Normal identifier: starts with [$_A-Za-z or Chinese], followed by [$_A-Za-z0-9 or Chinese]*
'[$_A-Za-z\\u4E00-\\u9FA5]' +
'[$_A-Za-z0-9\\u4E00-\\u9FA5]*' +
// Or
'|' +
// 2. Bracketed identifier: '[' followed by any combination of numbers or '*', ending with ']'
'\\[' +
'[0-9*]+' +
'\\]' +
')';
// Generate ID token for normal variable names or forms like [1]
export const ID = createToken({ name: 'ID', pattern: new RegExp(ID_PATTERN), line_breaks: false }); // Use helper pattern for ID
/**
* -----------------------------------------
* Field rule, reusing ID pattern
* -----------------------------------------
*/
// Define FIELD regular expression (FIELD_PATTERN):
// 1) Wrapped in hashes: #ID(.ID)*#
// 2) Without hashes: ID(.ID)*
// ".ID" indicates multiple sub-IDs connected with '.' (nested fields)
export const FIELD_PATTERN =
'(?:' + // Non-capturing group to handle two field formats
// 1. Wrapped in hashes
'#' +
ID_PATTERN +
'(?:' + // Non-capturing group for zero or more ".ID"
'\\.' +
ID_PATTERN +
')*' +
'#' +
// Or
'|' +
// 2. Regular nested fields
ID_PATTERN +
'(?:' +
'\\.' +
ID_PATTERN +
')*' +
')';
// Declare FIELD token; matches longer strings in preference to ID
export const FIELD = createToken({
name: 'FIELD',
pattern: new RegExp(FIELD_PATTERN),
longer_alt: ID,
}); // Reuse ID pattern in FIELD definition
/**
* -----------------------------------------
* Integer literals
* -----------------------------------------
*/
// Integer, e.g., 123
export const INT = createToken({ name: 'INT', pattern: /\d+/, line_breaks: false }); // Integer literals
/**
* -----------------------------------------
* Floating-point literals
* -----------------------------------------
*/
// Floating-point number, e.g., 123.45
export const DOUBLE = createToken({
name: 'DOUBLE',
pattern: /\d+\.\d+/,
line_breaks: false,
}); // Floating-point literals
/**
* -----------------------------------------
* String-related tokens
*
* Demonstrates how to reuse sub-patterns step by step:
* HexDigit -> UnicodeEscape -> EscapeSequence -> StringCharacter -> STRING
* -----------------------------------------
*/
// Hexadecimal characters [0-9a-fA-F]
export const HEX_DIGIT_PATTERN = '[0-9a-fA-F]';
// UnicodeEscape: Format \u1234 (backslash + 'u' + 4 HexDigits)
export const UNICODE_ESCAPE_PATTERN =
'\\u' + // Double backslashes represent a single backslash character
'' +
HEX_DIGIT_PATTERN +
'{4}';
// EscapeSequence: Includes \uXXXX or regular escape characters (\n, \t, \" etc.)
export const ESCAPE_SEQUENCE_PATTERN =
'\\\\[btnfr"“”\'\\\\]' + // Regular escapes, e.g., \n, \t, \", ...
'|' +
UNICODE_ESCAPE_PATTERN; // Include UnicodeEscape
/**
* StringCharacter represents a single character within a string:
* - Regular character (not quotes or backslash)
* - Or an escape sequence (EscapeSequence)
*/
export const STRING_CHARACTER_PATTERN =
'[^"“”\\\\]' + // Non-quote, non-backslash
'|' +
ESCAPE_SEQUENCE_PATTERN; // Reuse EscapeSequence
/**
* STRING_PATTERN:
* Supports both regular English quotes "..." and Chinese quotes “...”.
* Can contain multiple StringCharacters.
*/
const STRING_PATTERN =
'(?:' +
// English quotes:
'"' +
'(?:' +
STRING_CHARACTER_PATTERN +
')*' +
'"' +
')' +
'|' +
'(?:' +
// Chinese quotes:
'“' +
'(?:' +
STRING_CHARACTER_PATTERN +
')*' +
'”' +
')';
// Declare StringLiteral token to match complete strings
export const STRING = createToken({
name: 'STRING',
pattern: new RegExp(STRING_PATTERN),
line_breaks: true,
start_chars_hint: ['"“'],
});
/**
* -----------------------------------------
* Whitespace, ignored during parsing
* -----------------------------------------
*/
// Matches whitespace characters [ \t\r\n]+, skipped during parsing
export const WS = createToken({
name: 'WS',
pattern: /\s+/,
group: Lexer.SKIPPED,
}); // Skips spaces, tabs, and newlines
/**
* -----------------------------------------
* Export all tokens as an array
* in the order specified in the .g4 file
* -----------------------------------------
*/
export const allTokens = [
// Whitespace, ignored during parsing
WS,
// Comparison operators
EQ,
NEQ,
LE,
GE,
LT,
GT,
// Arithmetic and comparison operators
PLUS,
MINUS,
MULTIPLY,
DIVIDE,
// Parentheses for grouping expressions
LPAREN,
RPAREN,
LBRACKET,
RBRACKET,
// Comma as a separator (supports both half-width and full-width)
COMMA,
// Identifier rule for variables and fields
DOT,
// Boolean and null literals
TRUE,
FALSE,
NULL,
ID,
// Field rule, supporting both hashed (#) and nested formats
FIELD,
// Floating-point literals
DOUBLE,
// Integer literals
INT,
// String literals (supports both double quotes and Chinese quotation marks)
STRING,
// StringCharacter,
// EscapeSequence,
// UnicodeEscape,
// HexDigit,
PLUS_MINUS_OPS,
MUL_DIV_OPS,
COMPARE_OPS,
];import { IOrAlt, IParserConfig, OrMethodOpts } from '@chevrotain/types';
import { CstNode, CstParser, EOF, ParserMethod } from 'chevrotain';
import {
COMMA,
COMPARE_OPS,
DOT,
DOUBLE,
FALSE,
FIELD,
ID,
INT,
LBRACKET,
LPAREN,
MINUS,
MUL_DIV_OPS,
NULL,
PLUS,
PLUS_MINUS_OPS,
RBRACKET,
RPAREN,
STRING,
TRUE,
allTokens,
} from './formula.lexer';
export class FormulaParser extends CstParser {
// If only the formula entry is needed externally, only expose formula,
// other rules can be made private with _ruleName (optional).
public formula!: ParserMethod<[], CstNode>;
// Other rules can be read-only or fully public as needed
public expr!: ParserMethod<[], CstNode>;
public primary!: ParserMethod<[], CstNode>;
public field!: ParserMethod<[], CstNode>;
public bool!: ParserMethod<[], CstNode>;
public parens!: ParserMethod<[], CstNode>;
public func!: ParserMethod<[], CstNode>;
public list!: ParserMethod<[], CstNode>;
public unaryOperator!: ParserMethod<[], CstNode>;
public mulDiv!: ParserMethod<[], CstNode>;
public plusMinus!: ParserMethod<[], CstNode>;
public compare!: ParserMethod<[], CstNode>;
public error!: ParserMethod<[], CstNode>;
private c1: IOrAlt<any>[] | OrMethodOpts<any>;
private c2: IOrAlt<any>[] | OrMethodOpts<any>;
private c3: IOrAlt<any>[] | OrMethodOpts<any>;
private c4: IOrAlt<any>[] | OrMethodOpts<any>;
private c5: IOrAlt<any>[] | OrMethodOpts<any>;
constructor(config?: IParserConfig) {
// Pass allTokens to let the parser know available tokens
super(allTokens, config);
this.c1 = undefined;
this.c2 = undefined;
this.c3 = undefined;
this.c4 = undefined;
this.c5 = undefined;
const $ = this;
/**
* formula: Entry rule for the entire parsing process
* Explicitly consuming EOF is recommended to ensure remaining input is not ignored.
*/
$.formula = $.RULE('formula', () => {
$.SUBRULE($.expr);
// New: Consume EOF to raise an error if there are leftover tokens
$.CONSUME(EOF);
});
/**
* expr: Top-level expression, can include multiple operators (multiplication/division, addition/subtraction, comparison)
* First parses a primary, then continues if mulDiv/plusMinus/compare appears.
*/
$.expr = $.RULE('expr', () => {
$.SUBRULE($.compare);
});
/**
* primary: Basic expressions
* Lists all possible terminal nodes, such as FIELD, INT, DOUBLE, STRING, boolean, NULL, parentheses expressions, functions, lists, unary operations, etc.
*/
$.primary = $.RULE('primary', () => {
$.OR(
$.c1 ||
($.c1 = [
// prettier-ignore
{
GATE: () => {
return this.LA(1).tokenType === ID
&& this.LA(2).tokenType === LPAREN
},
ALT: () => $.SUBRULE($.func)
},
{ ALT: () => $.SUBRULE($.field) },
{ ALT: () => $.CONSUME(INT) },
{ ALT: () => $.CONSUME(DOUBLE) },
{ ALT: () => $.CONSUME(STRING) },
{ ALT: () => $.SUBRULE($.bool) },
{ ALT: () => $.CONSUME(NULL) },
{ ALT: () => $.SUBRULE($.parens) },
{ ALT: () => $.SUBRULE($.list) },
// { ALT: () => $.SUBRULE($.unaryOperator) },
]),
);
});
/**
* field: Parses FIELD or ID as a field reference
*/
$.field = $.RULE('field', () => {
$.OR(
$.c2 ||
($.c2 = [
// 1. Parses FIELD token
{ ALT: () => $.CONSUME(FIELD) },
// 2. Parses regular field access
{
ALT: () => {
$.CONSUME(ID); // First consume ID
$.MANY(() => {
$.OR2(
$.c3 ||
($.c3 = [
{
ALT: () => {
$.CONSUME(DOT); // '.'
$.CONSUME2(ID); // Second consume ID
},
},
{
ALT: () => {
$.CONSUME2(DOT); // '.'
$.CONSUME(LBRACKET); // '['
$.SUBRULE($.expr); // '1'
$.CONSUME(RBRACKET); // ']'
},
},
]),
);
});
},
},
]),
);
});
/**
* bool: Boolean literals (TRUE/FALSE)
*/
$.bool = $.RULE('bool', () => {
$.OR($.c4 || ($.c4 = [{ ALT: () => $.CONSUME(TRUE) }, { ALT: () => $.CONSUME(FALSE) }]));
});
/**
* parens: Parentheses expression ( expr )
* - LPAREN + expr + RPAREN
*/
$.parens = $.RULE('parens', () => {
$.CONSUME(LPAREN);
$.SUBRULE($.expr);
$.CONSUME(RPAREN);
});
/**
* func: Function calls
* Format: ID ( [expr(, expr)*] ), allows no arguments (OPTION)
*/
$.func = $.RULE('func', () => {
$.CONSUME(ID);
$.CONSUME(LPAREN);
$.OPTION(() => {
$.SUBRULE($.expr);
$.MANY(() => {
$.CONSUME(COMMA);
$.SUBRULE2($.expr);
});
});
$.CONSUME(RPAREN);
});
/**
* list: Lists [ expr (, expr)* ]
* Starts with LBRACKET '[', optional expr, multiple separated by commas, ends with RBRACKET ']'
*/
$.list = $.RULE('list', () => {
$.CONSUME(LBRACKET);
$.OPTION(() => {
$.SUBRULE($.expr);
$.MANY(() => {
$.CONSUME(COMMA);
$.SUBRULE2($.expr);
});
});
$.CONSUME(RBRACKET);
});
/**
* compare: Comparison operators
* =, !=, <, <=, >, >=
*/
$.compare = $.RULE('compare', () => {
$.SUBRULE($.plusMinus, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(COMPARE_OPS);
$.SUBRULE2($.plusMinus, { LABEL: 'rhs' });
});
});
/**
* plusMinus: Addition/Subtraction (binary operations)
* If + or - appears after expr, matches and consumes another primary
*/
$.plusMinus = $.RULE('plusMinus', () => {
$.SUBRULE($.mulDiv, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(PLUS_MINUS_OPS);
$.SUBRULE2($.mulDiv, { LABEL: 'rhs' });
});
});
/**
* mulDiv: Multiplication/Division (binary operations)
* Indicates that the current expr accepts a * or / and continues matching primary
*/
$.mulDiv = $.RULE('mulDiv', () => {
$.SUBRULE($.unaryOperator, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(MUL_DIV_OPS);
$.SUBRULE2($.unaryOperator, { LABEL: 'rhs' });
});
});
/**
* unaryOperator: Unary operations
* +primary / -primary
*/
$.unaryOperator = $.RULE('unaryOperator', () => {
$.OPTION(() => {
$.OR($.c5 || ($.c5 = [{ ALT: () => $.CONSUME(PLUS) }, { ALT: () => $.CONSUME(MINUS) }]));
});
$.SUBRULE($.primary);
});
$.error = $.RULE('error', () => {
$.SUBRULE($.expr);
$.SUBRULE2($.expr);
});
// Initialize the parser (mandatory)
$.performSelfAnalysis();
}
}const maxLookahead = 2;
const traceInitPerf = false;
const skipValidations = true;
const defaultParser = (() =>
new FormulaParser({
maxLookahead,
traceInitPerf,
skipValidations,
}))();
const locationTrackingParser = (() =>
new FormulaParser({
nodeLocationTracking: 'onlyOffset',
maxLookahead,
traceInitPerf,
skipValidations,
}))();
const createLexer = () =>
new Lexer(allTokens, {
ensureOptimizations: true,
traceInitPerf,
skipValidations,
}); |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 2 replies
-
|
Hey @hydra1983, I've converted this into a discussion. Have you tried profiling your code yet? Node.js features some pretty good profiling that should help you identify bottlenecks. Generally, Chevrotain has been the fastest option for parsing text in the JS space for me. However, it is easy to accidentally run into slow behavior. For example, do you get any parser errors? Or does the whole input parse without errors? Do you recreate the parser during your benchmark, or do you use the same object for the whole benchmark? |
Beta Was this translation helpful? Give feedback.
-
|
Hello @hydra1983 Modeling the binary operator precedence as part of the grammar tends to result in very deeply nested parse trees Perhaps Antlr has some optimizations around this pattern.
|
Beta Was this translation helpful? Give feedback.


Hello @hydra1983
Modeling the binary operator precedence as part of the grammar tends to result in very deeply nested parse trees
and a large performance overhead.
Perhaps Antlr has some optimizations around this pattern.
If you are building a productive grammar with Chevrotain which uses binary operators
I would recommend to apply the Swift programming language approach of parsing the operators as a flat list
and computing the precedence at a post-parsing phase.