-
Notifications
You must be signed in to change notification settings - Fork 215
Closed
Description
Performance test based on jest-bench
It took me three days to implement the g4 grammar using Chevrotain, but the performance turned out to be significantly worse than before.
Antlr4
grammar Formula;
// antlr的分析器定义文件
//////////////////// parser rules:
// The entry point of the grammar
formula
: expr
;
// Defines expressions based on operator precedence similar to JavaScript
// 运算符优先级按照JS的风格来,具体参考(https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Operators/Operator_Precedence)
expr
: FIELD # field // 字段 (e.g., #field.name#)
| INT # int // 整数 (e.g., 123)
| DOUBLE # double // 浮点数 (e.g., 123.45)
| String # str // 字符串 (e.g., "text")
| op=(TRUE | FALSE) # bool // 布尔值 (true/false)
| NULL # null // 空值 (null)
| LPAREN expr RPAREN # parens // 括号 (e.g., (a + b))
| ID LPAREN (expr (COMMA expr)*)? RPAREN # func // 函数调用 (e.g., SUM(a, b))
| '[' (expr (COMMA expr)*)? ']' # list // 数组 (e.g., [1, 2, 3])
| op=MINUS expr # unaryOperator // 一元运算符:负号(-)
| expr op=(MULTIPLY | DIVIDE) expr # mulDiv // 乘除法 (e.g., a * b or a / b)
| expr op=(PLUS | MINUS) expr # plusMinus // 加减法 (e.g., a + b or a - b)
| expr op=(EQ | NEQ | LT | LE | GT | GE) expr # compare // 比较符 (e.g., a > b, a == b)
| expr expr # error // 错误 (invalid expressions)
;
//////////////////// lexer rules:
// Arithmetic and comparison operators
PLUS : '+'; // Addition
MINUS : '-'; // Subtraction
MULTIPLY : '*'; // Multiplication
DIVIDE : '/'; // Division
// Parentheses for grouping expressions
LPAREN : '(' | '('; // Left parenthesis (including full-width version)
RPAREN : ')' | ')'; // Right parenthesis (including full-width version)
// Comparison operators
EQ : '=='; // Equal
NEQ : '!='; // Not equal
LT : '<'; // Less than
LE : '<='; // Less than or equal to
GT : '>'; // Greater than
GE : '>='; // Greater than or equal to
// Boolean and null literals
FALSE : 'FALSE' | 'false'; // Boolean false
TRUE : 'TRUE' | 'true'; // Boolean true
NULL : 'NULL' | 'null'; // Null value
// Comma as a separator (supports both half-width and full-width)
COMMA : ',' | ','; // Comma separator
// Identifier rule for variables and fields
ID
: [$_A-Za-z\u4e00-\u9fa5][_A-Za-z0-9\u4e00-\u9fa5]* | '[' [0-9*]+ ']'
; // Identifiers can include letters, numbers, and underscores
// Field rule, supporting both hashed and nested formats
FIELD
: '#' ID ('.' ID)* '#' | ID ('.' ID)*
; // e.g., #field.name# or field.name
// Integer literals
INT
: '0' | [1-9][0-9]*
; // e.g., 0, 123, 456
// Floating-point literals
DOUBLE
: ('0' | [1-9][0-9]*)('.'[0-9]*)?
; // e.g., 0.1, 123.45
// Whitespace, ignored during parsing
WS
: [ \t\r\n]+ -> skip
; // Skips spaces, tabs, and newlines
// String literals (supports both double quotes and Chinese quotation marks)
String
: '"' (StringCharacter+)? '"' | '“' (StringCharacter+)? '”'
; // e.g., "text", “text”
// Allowed characters within strings
StringCharacter
: ~["“”\\] | EscapeSequence
; // Any character except quotes or backslashes, or an escape sequence
// Escape sequences for strings
EscapeSequence
: '\\' [btnfr"“”'\\] | UnicodeEscape
; // e.g., \n, \t, \" or Unicode sequences
// Unicode escape sequence
UnicodeEscape
: '\\' 'u' HexDigit HexDigit HexDigit HexDigit
; // e.g., \u1234
// Hexadecimal digit
HexDigit
: [0-9a-fA-F]
; // Hexadecimal characters (0-9, A-F)Chevrontain v10.5.0
comments are generated by o1-mini
import { createToken, Lexer } from 'chevrotain';
/**
* -----------------------------------------
* Arithmetic and comparison operators
* -----------------------------------------
*/
export const PLUS_MINUS_OPS = createToken({
name: 'PLUS_MINUS_OPS',
pattern: Lexer.NA,
});
// Plus sign, e.g., 1 + 2
export const PLUS = createToken({ name: 'PLUS', pattern: /\+/, categories: PLUS_MINUS_OPS }); // Addition
// Minus sign, e.g., 1 - 2
export const MINUS = createToken({ name: 'MINUS', pattern: /-/, categories: PLUS_MINUS_OPS }); // Subtraction
export const MUL_DIV_OPS = createToken({
name: 'MUL_DIV_OPS',
pattern: Lexer.NA,
});
// Multiplication sign, e.g., 1 * 2
export const MULTIPLY = createToken({ name: 'MULTIPLY', pattern: /\*/, categories: MUL_DIV_OPS }); // Multiplication
// Division sign, e.g., 1 / 2
export const DIVIDE = createToken({ name: 'DIVIDE', pattern: /\//, categories: MUL_DIV_OPS }); // Division
/**
* -----------------------------------------
* Parentheses for grouping expressions
* (includes full-width versions)
* -----------------------------------------
*/
// Left parenthesis '(' or '('
export const LPAREN = createToken({ name: 'LPAREN', pattern: /\(|(/ }); // Left parenthesis
// Right parenthesis ')' or ')'
export const RPAREN = createToken({ name: 'RPAREN', pattern: /\)|)/ }); // Right parenthesis
/**
* -----------------------------------------
* Brackets for indexing or grouping
* -----------------------------------------
*/
// Left square bracket '['
export const LBRACKET = createToken({ name: 'LBRACKET', pattern: /\[/ }); // Left bracket
// Right square bracket ']'
export const RBRACKET = createToken({ name: 'RBRACKET', pattern: /\]/ }); // Right bracket
/**
* -----------------------------------------
* Comparison operators
* -----------------------------------------
*/
export const COMPARE_OPS = createToken({
name: 'COMPARE_OPS',
pattern: Lexer.NA,
});
// Less than or equal '<='
export const LE = createToken({ name: 'LE', pattern: /<=/, line_breaks: false, categories: COMPARE_OPS }); // Less Than or Equal
// Greater than or equal '>='
export const GE = createToken({ name: 'GE', pattern: />=/, line_breaks: false, categories: COMPARE_OPS }); // Greater Than or Equal
// Less than '<'
export const LT = createToken({ name: 'LT', pattern: /</, line_breaks: false, categories: COMPARE_OPS }); // Less Than
// Greater than '>'
export const GT = createToken({ name: 'GT', pattern: />/, line_breaks: false, categories: COMPARE_OPS }); // Greater Than
// Equal '=='
export const EQ = createToken({ name: 'EQ', pattern: /==/, line_breaks: false, categories: COMPARE_OPS }); // Equal
// Not equal '!='
export const NEQ = createToken({ name: 'NEQ', pattern: /!=/, line_breaks: false, categories: COMPARE_OPS }); // Not Equal
/**
* -----------------------------------------
* Boolean and null literals
* -----------------------------------------
*/
// Boolean true, matches 'TRUE' or 'true'
export const TRUE = createToken({ name: 'TRUE', pattern: /TRUE|true/, line_breaks: false }); // Boolean true
// Boolean false, matches 'FALSE' or 'false'
export const FALSE = createToken({ name: 'FALSE', pattern: /FALSE|false/, line_breaks: false }); // Boolean false
// Null value, matches 'NULL' or 'null'
export const NULL = createToken({ name: 'NULL', pattern: /NULL|null/, line_breaks: false }); // Null value
/**
* -----------------------------------------
* Comma as a separator (supports both half-width and full-width)
* -----------------------------------------
*/
// Comma separator, matches ',' or ','
export const COMMA = createToken({ name: 'COMMA', pattern: /,|,/ }); // Comma separator
/**
* -----------------------------------------
* Identifier rule for variables and fields
* -----------------------------------------
*/
// Define DOT token for matching '.' character
export const DOT = createToken({ name: 'DOT', pattern: /\./ });
// Define a regular expression for identifiers (ID_PATTERN)
// 1) Normal form: First character can be $_A-Za-z or Chinese, followed by $_A-Za-z0-9 or Chinese
// 2) Bracket form: [1], [2], [*], etc.
export const ID_PATTERN =
'(?:' + // Non-capturing group to handle two cases
// 1. Normal identifier: starts with [$_A-Za-z or Chinese], followed by [$_A-Za-z0-9 or Chinese]*
'[$_A-Za-z\\u4E00-\\u9FA5]' +
'[$_A-Za-z0-9\\u4E00-\\u9FA5]*' +
// Or
'|' +
// 2. Bracketed identifier: '[' followed by any combination of numbers or '*', ending with ']'
'\\[' +
'[0-9*]+' +
'\\]' +
')';
// Generate ID token for normal variable names or forms like [1]
export const ID = createToken({ name: 'ID', pattern: new RegExp(ID_PATTERN), line_breaks: false }); // Use helper pattern for ID
/**
* -----------------------------------------
* Field rule, reusing ID pattern
* -----------------------------------------
*/
// Define FIELD regular expression (FIELD_PATTERN):
// 1) Wrapped in hashes: #ID(.ID)*#
// 2) Without hashes: ID(.ID)*
// ".ID" indicates multiple sub-IDs connected with '.' (nested fields)
export const FIELD_PATTERN =
'(?:' + // Non-capturing group to handle two field formats
// 1. Wrapped in hashes
'#' +
ID_PATTERN +
'(?:' + // Non-capturing group for zero or more ".ID"
'\\.' +
ID_PATTERN +
')*' +
'#' +
// Or
'|' +
// 2. Regular nested fields
ID_PATTERN +
'(?:' +
'\\.' +
ID_PATTERN +
')*' +
')';
// Declare FIELD token; matches longer strings in preference to ID
export const FIELD = createToken({
name: 'FIELD',
pattern: new RegExp(FIELD_PATTERN),
longer_alt: ID,
}); // Reuse ID pattern in FIELD definition
/**
* -----------------------------------------
* Integer literals
* -----------------------------------------
*/
// Integer, e.g., 123
export const INT = createToken({ name: 'INT', pattern: /\d+/, line_breaks: false }); // Integer literals
/**
* -----------------------------------------
* Floating-point literals
* -----------------------------------------
*/
// Floating-point number, e.g., 123.45
export const DOUBLE = createToken({
name: 'DOUBLE',
pattern: /\d+\.\d+/,
line_breaks: false,
}); // Floating-point literals
/**
* -----------------------------------------
* String-related tokens
*
* Demonstrates how to reuse sub-patterns step by step:
* HexDigit -> UnicodeEscape -> EscapeSequence -> StringCharacter -> STRING
* -----------------------------------------
*/
// Hexadecimal characters [0-9a-fA-F]
export const HEX_DIGIT_PATTERN = '[0-9a-fA-F]';
// UnicodeEscape: Format \u1234 (backslash + 'u' + 4 HexDigits)
export const UNICODE_ESCAPE_PATTERN =
'\\u' + // Double backslashes represent a single backslash character
'' +
HEX_DIGIT_PATTERN +
'{4}';
// EscapeSequence: Includes \uXXXX or regular escape characters (\n, \t, \" etc.)
export const ESCAPE_SEQUENCE_PATTERN =
'\\\\[btnfr"“”\'\\\\]' + // Regular escapes, e.g., \n, \t, \", ...
'|' +
UNICODE_ESCAPE_PATTERN; // Include UnicodeEscape
/**
* StringCharacter represents a single character within a string:
* - Regular character (not quotes or backslash)
* - Or an escape sequence (EscapeSequence)
*/
export const STRING_CHARACTER_PATTERN =
'[^"“”\\\\]' + // Non-quote, non-backslash
'|' +
ESCAPE_SEQUENCE_PATTERN; // Reuse EscapeSequence
/**
* STRING_PATTERN:
* Supports both regular English quotes "..." and Chinese quotes “...”.
* Can contain multiple StringCharacters.
*/
const STRING_PATTERN =
'(?:' +
// English quotes:
'"' +
'(?:' +
STRING_CHARACTER_PATTERN +
')*' +
'"' +
')' +
'|' +
'(?:' +
// Chinese quotes:
'“' +
'(?:' +
STRING_CHARACTER_PATTERN +
')*' +
'”' +
')';
// Declare StringLiteral token to match complete strings
export const STRING = createToken({
name: 'STRING',
pattern: new RegExp(STRING_PATTERN),
line_breaks: true,
start_chars_hint: ['"“'],
});
/**
* -----------------------------------------
* Whitespace, ignored during parsing
* -----------------------------------------
*/
// Matches whitespace characters [ \t\r\n]+, skipped during parsing
export const WS = createToken({
name: 'WS',
pattern: /\s+/,
group: Lexer.SKIPPED,
}); // Skips spaces, tabs, and newlines
/**
* -----------------------------------------
* Export all tokens as an array
* in the order specified in the .g4 file
* -----------------------------------------
*/
export const allTokens = [
// Whitespace, ignored during parsing
WS,
// Comparison operators
EQ,
NEQ,
LE,
GE,
LT,
GT,
// Arithmetic and comparison operators
PLUS,
MINUS,
MULTIPLY,
DIVIDE,
// Parentheses for grouping expressions
LPAREN,
RPAREN,
LBRACKET,
RBRACKET,
// Comma as a separator (supports both half-width and full-width)
COMMA,
// Identifier rule for variables and fields
DOT,
// Boolean and null literals
TRUE,
FALSE,
NULL,
ID,
// Field rule, supporting both hashed (#) and nested formats
FIELD,
// Floating-point literals
DOUBLE,
// Integer literals
INT,
// String literals (supports both double quotes and Chinese quotation marks)
STRING,
// StringCharacter,
// EscapeSequence,
// UnicodeEscape,
// HexDigit,
PLUS_MINUS_OPS,
MUL_DIV_OPS,
COMPARE_OPS,
];import { IOrAlt, IParserConfig, OrMethodOpts } from '@chevrotain/types';
import { CstNode, CstParser, EOF, ParserMethod } from 'chevrotain';
import {
COMMA,
COMPARE_OPS,
DOT,
DOUBLE,
FALSE,
FIELD,
ID,
INT,
LBRACKET,
LPAREN,
MINUS,
MUL_DIV_OPS,
NULL,
PLUS,
PLUS_MINUS_OPS,
RBRACKET,
RPAREN,
STRING,
TRUE,
allTokens,
} from './formula.lexer';
export class FormulaParser extends CstParser {
// If only the formula entry is needed externally, only expose formula,
// other rules can be made private with _ruleName (optional).
public formula!: ParserMethod<[], CstNode>;
// Other rules can be read-only or fully public as needed
public expr!: ParserMethod<[], CstNode>;
public primary!: ParserMethod<[], CstNode>;
public field!: ParserMethod<[], CstNode>;
public bool!: ParserMethod<[], CstNode>;
public parens!: ParserMethod<[], CstNode>;
public func!: ParserMethod<[], CstNode>;
public list!: ParserMethod<[], CstNode>;
public unaryOperator!: ParserMethod<[], CstNode>;
public mulDiv!: ParserMethod<[], CstNode>;
public plusMinus!: ParserMethod<[], CstNode>;
public compare!: ParserMethod<[], CstNode>;
public error!: ParserMethod<[], CstNode>;
private c1: IOrAlt<any>[] | OrMethodOpts<any>;
private c2: IOrAlt<any>[] | OrMethodOpts<any>;
private c3: IOrAlt<any>[] | OrMethodOpts<any>;
private c4: IOrAlt<any>[] | OrMethodOpts<any>;
private c5: IOrAlt<any>[] | OrMethodOpts<any>;
constructor(config?: IParserConfig) {
// Pass allTokens to let the parser know available tokens
super(allTokens, config);
this.c1 = undefined;
this.c2 = undefined;
this.c3 = undefined;
this.c4 = undefined;
this.c5 = undefined;
const $ = this;
/**
* formula: Entry rule for the entire parsing process
* Explicitly consuming EOF is recommended to ensure remaining input is not ignored.
*/
$.formula = $.RULE('formula', () => {
$.SUBRULE($.expr);
// New: Consume EOF to raise an error if there are leftover tokens
$.CONSUME(EOF);
});
/**
* expr: Top-level expression, can include multiple operators (multiplication/division, addition/subtraction, comparison)
* First parses a primary, then continues if mulDiv/plusMinus/compare appears.
*/
$.expr = $.RULE('expr', () => {
$.SUBRULE($.compare);
});
/**
* primary: Basic expressions
* Lists all possible terminal nodes, such as FIELD, INT, DOUBLE, STRING, boolean, NULL, parentheses expressions, functions, lists, unary operations, etc.
*/
$.primary = $.RULE('primary', () => {
$.OR(
$.c1 ||
($.c1 = [
// prettier-ignore
{
GATE: () => {
return this.LA(1).tokenType === ID
&& this.LA(2).tokenType === LPAREN
},
ALT: () => $.SUBRULE($.func)
},
{ ALT: () => $.SUBRULE($.field) },
{ ALT: () => $.CONSUME(INT) },
{ ALT: () => $.CONSUME(DOUBLE) },
{ ALT: () => $.CONSUME(STRING) },
{ ALT: () => $.SUBRULE($.bool) },
{ ALT: () => $.CONSUME(NULL) },
{ ALT: () => $.SUBRULE($.parens) },
{ ALT: () => $.SUBRULE($.list) },
// { ALT: () => $.SUBRULE($.unaryOperator) },
]),
);
});
/**
* field: Parses FIELD or ID as a field reference
*/
$.field = $.RULE('field', () => {
$.OR(
$.c2 ||
($.c2 = [
// 1. Parses FIELD token
{ ALT: () => $.CONSUME(FIELD) },
// 2. Parses regular field access
{
ALT: () => {
$.CONSUME(ID); // First consume ID
$.MANY(() => {
$.OR2(
$.c3 ||
($.c3 = [
{
ALT: () => {
$.CONSUME(DOT); // '.'
$.CONSUME2(ID); // Second consume ID
},
},
{
ALT: () => {
$.CONSUME2(DOT); // '.'
$.CONSUME(LBRACKET); // '['
$.SUBRULE($.expr); // '1'
$.CONSUME(RBRACKET); // ']'
},
},
]),
);
});
},
},
]),
);
});
/**
* bool: Boolean literals (TRUE/FALSE)
*/
$.bool = $.RULE('bool', () => {
$.OR($.c4 || ($.c4 = [{ ALT: () => $.CONSUME(TRUE) }, { ALT: () => $.CONSUME(FALSE) }]));
});
/**
* parens: Parentheses expression ( expr )
* - LPAREN + expr + RPAREN
*/
$.parens = $.RULE('parens', () => {
$.CONSUME(LPAREN);
$.SUBRULE($.expr);
$.CONSUME(RPAREN);
});
/**
* func: Function calls
* Format: ID ( [expr(, expr)*] ), allows no arguments (OPTION)
*/
$.func = $.RULE('func', () => {
$.CONSUME(ID);
$.CONSUME(LPAREN);
$.OPTION(() => {
$.SUBRULE($.expr);
$.MANY(() => {
$.CONSUME(COMMA);
$.SUBRULE2($.expr);
});
});
$.CONSUME(RPAREN);
});
/**
* list: Lists [ expr (, expr)* ]
* Starts with LBRACKET '[', optional expr, multiple separated by commas, ends with RBRACKET ']'
*/
$.list = $.RULE('list', () => {
$.CONSUME(LBRACKET);
$.OPTION(() => {
$.SUBRULE($.expr);
$.MANY(() => {
$.CONSUME(COMMA);
$.SUBRULE2($.expr);
});
});
$.CONSUME(RBRACKET);
});
/**
* compare: Comparison operators
* =, !=, <, <=, >, >=
*/
$.compare = $.RULE('compare', () => {
$.SUBRULE($.plusMinus, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(COMPARE_OPS);
$.SUBRULE2($.plusMinus, { LABEL: 'rhs' });
});
});
/**
* plusMinus: Addition/Subtraction (binary operations)
* If + or - appears after expr, matches and consumes another primary
*/
$.plusMinus = $.RULE('plusMinus', () => {
$.SUBRULE($.mulDiv, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(PLUS_MINUS_OPS);
$.SUBRULE2($.mulDiv, { LABEL: 'rhs' });
});
});
/**
* mulDiv: Multiplication/Division (binary operations)
* Indicates that the current expr accepts a * or / and continues matching primary
*/
$.mulDiv = $.RULE('mulDiv', () => {
$.SUBRULE($.unaryOperator, { LABEL: 'lhs' });
$.MANY(() => {
$.CONSUME(MUL_DIV_OPS);
$.SUBRULE2($.unaryOperator, { LABEL: 'rhs' });
});
});
/**
* unaryOperator: Unary operations
* +primary / -primary
*/
$.unaryOperator = $.RULE('unaryOperator', () => {
$.OPTION(() => {
$.OR($.c5 || ($.c5 = [{ ALT: () => $.CONSUME(PLUS) }, { ALT: () => $.CONSUME(MINUS) }]));
});
$.SUBRULE($.primary);
});
$.error = $.RULE('error', () => {
$.SUBRULE($.expr);
$.SUBRULE2($.expr);
});
// Initialize the parser (mandatory)
$.performSelfAnalysis();
}
}const maxLookahead = 2;
const traceInitPerf = false;
const skipValidations = true;
const defaultParser = (() =>
new FormulaParser({
maxLookahead,
traceInitPerf,
skipValidations,
}))();
const locationTrackingParser = (() =>
new FormulaParser({
nodeLocationTracking: 'onlyOffset',
maxLookahead,
traceInitPerf,
skipValidations,
}))();
const createLexer = () =>
new Lexer(allTokens, {
ensureOptimizations: true,
traceInitPerf,
skipValidations,
});Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels

