Skip to content

Commit b4e9c9c

Browse files
committed
Improving Lexer speed 2-3x
Updating with latest compliance tests. Removing deprecated literal parsing with elided quotes (JEP 12). Fixing a few precedence issues.
1 parent 119ddb7 commit b4e9c9c

File tree

9 files changed

+382
-253
lines changed

9 files changed

+382
-253
lines changed

src/Lexer.php

Lines changed: 249 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -6,43 +6,61 @@
66
*/
77
class Lexer
88
{
9-
private $regex, $offsetToToken;
10-
11-
private $tokens = [
12-
'[a-zA-Z_][a-zA-Z_0-9]*' => 'identifier',
13-
'\.' => 'dot',
14-
'\*' => 'star',
15-
'\[\]' => 'flatten',
16-
'-?\d+' => 'number',
17-
'\|\|' => 'or',
18-
'\|' => 'pipe',
19-
'\[\?' => 'filter',
20-
'\[' => 'lbracket',
21-
'\]' => 'rbracket',
22-
'\'(?:\\\\\\\\|\\\\\'|[^\'])*\'' => 'raw_string',
23-
'"(?:\\\\\\\\|\\\\"|[^"])*"' => 'quoted_identifier',
24-
'`(?:\\\\\\\\|\\\\`|[^`])*`' => 'literal',
25-
',' => 'comma',
26-
':' => 'colon',
27-
'@' => 'current',
28-
'&' => 'expref',
29-
'\(' => 'lparen',
30-
'\)' => 'rparen',
31-
'\{' => 'lbrace',
32-
'\}' => 'rbrace',
33-
'!=' => 'comparator',
34-
'==' => 'comparator',
35-
'<=' => 'comparator',
36-
'>=' => 'comparator',
37-
'<' => 'comparator',
38-
'>' => 'comparator',
39-
'[ \t]' => 'skip',
9+
/** @var array Characters that can start an identifier */
10+
private $startIdentifier = [
11+
'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
12+
'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
13+
'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
14+
'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
15+
'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
16+
'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
17+
'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
18+
'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
19+
'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
20+
't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
21+
'y' => true, 'z' => true, '_' => true,
22+
];
23+
24+
/** @var array Number characters */
25+
private $numbers = [
26+
0 => true, 1 => true, 2 => true, 3 => true, 4 => true, 5 => true,
27+
6 => true, 7 => true, 8 => true, 9 => true,
28+
];
29+
30+
/** @var array Characters that can start a number (ctor calculated) */
31+
private $startNumber;
32+
33+
/** @var array Valid identifier characters (ctor calculated) */
34+
private $validIdentifier;
35+
36+
/** @var array Map of simple single character tokens */
37+
private $simpleTokens = [
38+
'.' => 'dot',
39+
'*' => 'star',
40+
']' => 'rbracket',
41+
',' => 'comma',
42+
':' => 'colon',
43+
'@' => 'current',
44+
'&' => 'expref',
45+
'(' => 'lparen',
46+
')' => 'rparen',
47+
'{' => 'lbrace',
48+
'}' => 'rbrace',
49+
];
50+
51+
/** @var array Map of whitespace characters */
52+
private $whitespace = [
53+
' ' => 'skip',
54+
"\t" => 'skip',
55+
"\n" => 'skip',
56+
"\r" => 'skip',
4057
];
4158

4259
public function __construct()
4360
{
44-
$this->regex = '((' . implode(')|(', array_keys($this->tokens)) . '))';
45-
$this->offsetToToken = array_values($this->tokens);
61+
$this->validIdentifier = $this->startIdentifier + $this->numbers;
62+
$this->startNumber = $this->numbers;
63+
$this->startNumber['-'] = true;
4664
}
4765

4866
/**
@@ -56,101 +74,228 @@ public function __construct()
5674
*/
5775
public function tokenize($input)
5876
{
59-
if (!preg_match_all($this->regex, $input, $matches, PREG_SET_ORDER)) {
60-
throw $this->throwSyntax('Invalid expression', 0, $input);
77+
if ($input === '') {
78+
goto eof;
6179
}
6280

63-
$offset = 0;
81+
$chars = str_split($input);
6482
$tokens = [];
65-
foreach ($matches as $match) {
66-
$type = $this->offsetToToken[count($match) - 2];
67-
if ($type !== 'skip') {
68-
$token = ['type' => $type, 'value' => $match[0], 'pos' => $offset];
69-
switch ($token['type']) {
70-
case 'quoted_identifier':
71-
$token['value'] = $this->decodeJson(
72-
$token['value'], $offset, $input
73-
);
74-
break;
75-
case 'number':
76-
$token['value'] = (int) $token['value'];
77-
break;
78-
case 'literal':
79-
$token['value'] = $this->literal(
80-
$token['value'], $offset, $input
81-
);
82-
break;
83-
case 'raw_string':
84-
$token['type'] = 'literal';
85-
$token['value'] = substr($token['value'], 1, -1);
86-
$token['value'] = str_replace("\\'", "'", $token['value']);
87-
break;
88-
}
89-
$tokens[] = $token;
83+
84+
consume:
85+
86+
$current = current($chars);
87+
88+
if ($current === false) {
89+
goto eof;
90+
}
91+
92+
if (isset($this->simpleTokens[$current])) {
93+
// Consume simple tokens like ".", ",", "@", etc.
94+
$tokens[] = [
95+
'type' => $this->simpleTokens[$current],
96+
'pos' => key($chars),
97+
'value' => $current
98+
];
99+
next($chars);
100+
} elseif (isset($this->whitespace[$current])) {
101+
// Skip whitespace
102+
next($chars);
103+
} elseif (isset($this->startIdentifier[$current])) {
104+
// Consume identifiers
105+
$start = key($chars);
106+
$buffer = '';
107+
do {
108+
$buffer .= $current;
109+
$current = next($chars);
110+
} while ($current !== false && isset($this->validIdentifier[$current]));
111+
$tokens[] = [
112+
'type' => 'identifier',
113+
'value' => $buffer,
114+
'pos' => $start
115+
];
116+
} elseif (isset($this->startNumber[$current])) {
117+
// Consume numbers
118+
$start = key($chars);
119+
$buffer = '';
120+
do {
121+
$buffer .= $current;
122+
$current = next($chars);
123+
} while ($current !== false && isset($this->numbers[$current]));
124+
$tokens[] = [
125+
'type' => 'number',
126+
'value' => (int) $buffer,
127+
'pos' => $start
128+
];
129+
} elseif ($current === '|') {
130+
// Consume pipe and OR
131+
$tokens[] = $this->matchOr($chars, '|', '|', 'or', 'pipe');
132+
} elseif ($current === '[') {
133+
// Consume "[", "[?", and "[]"
134+
$position = key($chars);
135+
$actual = next($chars);
136+
if ($actual === ']') {
137+
next($chars);
138+
$tokens[] = [
139+
'type' => 'flatten',
140+
'pos' => $position,
141+
'value' => '[]'
142+
];
143+
} elseif ($actual === '?') {
144+
next($chars);
145+
$tokens[] = [
146+
'type' => 'filter',
147+
'pos' => $position,
148+
'value' => '[?'
149+
];
150+
} else {
151+
$tokens[] = [
152+
'type' => 'lbracket',
153+
'pos' => $position,
154+
'value' => '['
155+
];
156+
}
157+
} elseif ($current === "'") {
158+
// Consume raw string literals
159+
$tokens[] = $this->inside($chars, "'", 'literal');
160+
} elseif ($current === "`") {
161+
// Consume JSON literals
162+
$token = $this->inside($chars, '`', 'literal');
163+
if ($token['type'] === 'literal') {
164+
$token['value'] = str_replace('\\`', '`', $token['value']);
165+
$token = $this->parseJson($token);
90166
}
91-
$offset += strlen($match[0]);
167+
$tokens[] = $token;
168+
} elseif ($current === '"') {
169+
// Consume quoted identifiers
170+
$token = $this->inside($chars, '"', 'quoted_identifier');
171+
if ($token['type'] === 'quoted_identifier') {
172+
$token['value'] = '"' . $token['value'] . '"';
173+
$token = $this->parseJson($token);
174+
}
175+
$tokens[] = $token;
176+
} elseif ($current === '!') {
177+
// Consume not equal
178+
$tokens[] = $this->matchOr($chars, '!', '=', 'comparator', 'unknown');
179+
} elseif ($current === '>' || $current === '<') {
180+
// Consume less than and greater than
181+
$tokens[] = $this->matchOr($chars, $current, '=', 'comparator', 'comparator');
182+
} elseif ($current === '=') {
183+
// Consume equals
184+
$tokens[] = $this->matchOr($chars, '=', '=', 'comparator', 'unknown');
185+
} else {
186+
$tokens[] = [
187+
'type' => 'unknown',
188+
'pos' => key($chars),
189+
'value' => $current
190+
];
191+
next($chars);
92192
}
93193

94-
$tokens[] = ['type' => 'eof', 'pos' => $offset, 'value' => null];
194+
goto consume;
95195

96-
if (strlen($input) != $offset) {
97-
$this->invalidExpression($input);
196+
eof: {
197+
$tokens[] = [
198+
'type' => 'eof',
199+
'pos' => strlen($input),
200+
'value' => null
201+
];
202+
return $tokens;
98203
}
99-
100-
return $tokens;
101204
}
102205

103-
private function literal($value, $offset, $input)
206+
/**
207+
* Returns a token based on whether or not the next token matches the
208+
* expected value. If it does, a token of "$type" is returned. Otherwise,
209+
* a token of "$orElse" type is returned.
210+
*
211+
* @param array $chars Array of characters by reference.
212+
* @param string $current The current character.
213+
* @param string $expected Expected character.
214+
* @param string $type Expected result type.
215+
* @param string $orElse Otherwise return a token of this type.
216+
*
217+
* @return array Returns a conditional token.
218+
*/
219+
private function matchOr(array &$chars, $current, $expected, $type, $orElse)
104220
{
105-
// Handles true, false, null, numbers, quoted strings, "[", and "{"
106-
static $valid = '/(true|false|null)|(^[\["{])|(^\-?[0-9]*(\.[0-9]+)?([e|E][+|\-][0-9]+)?$)/';
107-
$value = str_replace('\\`', '`', ltrim(substr($value, 1, -1)));
221+
$position = key($chars);
222+
$actual = next($chars);
223+
224+
if ($actual === $expected) {
225+
next($chars);
226+
return [
227+
'type' => $type,
228+
'pos' => $position,
229+
'value' => $current . $expected
230+
];
231+
}
108232

109-
return preg_match($valid, $value) && $value !== ''
110-
? $this->decodeJson($value, $offset, $input)
111-
: $this->decodeJson('"' . $value . '"', $offset, $input);
233+
return [
234+
'type' => $orElse,
235+
'pos' => $position,
236+
'value' => $current
237+
];
112238
}
113239

114-
private function decodeJson($json, $offset, $input)
240+
/**
241+
* Returns a token that is the result of consuming inside of delimiter
242+
* characters. Escaped delimiters will be adjusted before returning a
243+
* value. If the token is not closed, "unknown" is returned.
244+
*
245+
* @param array $chars Array of characters by reference.
246+
* @param string $delim The delimiter character.
247+
* @param string $type Token type.
248+
*
249+
* @return array Returns the consumed token.
250+
*/
251+
private function inside(array &$chars, $delim, $type)
115252
{
116-
static $errs = [
117-
JSON_ERROR_DEPTH => 'JSON_ERROR_DEPTH',
118-
JSON_ERROR_STATE_MISMATCH => 'JSON_ERROR_STATE_MISMATCH',
119-
JSON_ERROR_CTRL_CHAR => 'JSON_ERROR_CTRL_CHAR',
120-
JSON_ERROR_SYNTAX => 'JSON_ERROR_SYNTAX',
121-
JSON_ERROR_UTF8 => 'JSON_ERROR_UTF8'
122-
];
253+
$position = key($chars);
254+
$current = next($chars);
255+
$buffer = '';
123256

124-
$value = json_decode($json, true);
257+
while ($current !== $delim) {
258+
259+
if ($current === '\\') {
260+
$buffer .= '\\';
261+
$current = next($chars);
262+
}
263+
264+
if ($current === false) {
265+
return [
266+
'type' => 'unknown',
267+
'value' => $buffer,
268+
'pos' => $position
269+
];
270+
}
271+
272+
$buffer .= $current;
273+
$current = next($chars);
125274

126-
if ($error = json_last_error()) {
127-
$message = isset($errs[$error]) ? $errs[$error] : 'Unknown error';
128-
throw $this->throwSyntax(
129-
"Error decoding JSON: ({$error}) {$message}, given {$json}",
130-
$offset,
131-
$input
132-
);
133275
}
134276

135-
return $value;
136-
}
277+
next($chars);
137278

138-
private function throwSyntax($message, $offset, $input)
139-
{
140-
return new SyntaxErrorException(
141-
$message,
142-
['value' => substr($input, $offset, 1), 'pos' => $offset],
143-
$input
144-
);
279+
return ['type' => $type, 'value' => $buffer, 'pos' => $position];
145280
}
146281

147-
private function invalidExpression($input)
282+
/**
283+
* Parses a JSON token or sets the token type to "unknown" on error.
284+
*
285+
* @param array $token Token that needs parsing.
286+
*
287+
* @return array Returns a token with a parsed value.
288+
*/
289+
private function parseJson(array $token)
148290
{
149-
$offset = 0;
150-
while (preg_match("{$this->regex}A", $input, $matches, 0, $offset)) {
151-
$offset += strlen($matches[0]);
291+
$value = json_decode($token['value'], true);
292+
293+
if ($error = json_last_error()) {
294+
$token['type'] = 'unknown';
295+
return $token;
152296
}
153297

154-
throw $this->throwSyntax('Unexpected character', $offset, $input);
298+
$token['value'] = $value;
299+
return $token;
155300
}
156301
}

0 commit comments

Comments
 (0)