Skip to content

Commit f4cbfd4

Browse files
committed
Optimizing lexer
1 parent 3f7d2bb commit f4cbfd4

File tree

1 file changed

+19
-49
lines changed

1 file changed

+19
-49
lines changed

src/Lexer.php

Lines changed: 19 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ class Lexer
88
{
99
private $regex, $offsetToToken;
1010

11-
/** @var array Map of regular expressions to their token match value */
12-
private $tokenMap = [
11+
private $tokens = [
1312
'[a-zA-Z_][a-zA-Z_0-9]*' => 'identifier',
1413
'\.' => 'dot',
1514
'\*' => 'star',
@@ -41,16 +40,13 @@ class Lexer
4140

4241
public function __construct()
4342
{
44-
$this->regex = '((' .
45-
implode(')|(', array_keys($this->tokenMap)) . '))';
46-
$this->offsetToToken = array_values($this->tokenMap);
43+
$this->regex = '((' . implode(')|(', array_keys($this->tokens)) . '))';
44+
$this->offsetToToken = array_values($this->tokens);
4745
}
4846

4947
/**
50-
* Tokenize the JMESPath expression into an array of tokens.
51-
*
52-
* Each token array contains a type, value, and pos key along with any
53-
* other keys that might be relevant to the particular token.
48+
* Tokenize the JMESPath expression into an array of token hashes that
49+
* contain a 'type', 'value', and 'pos'.
5450
*
5551
* @param string $input JMESPath input
5652
*
@@ -59,23 +55,16 @@ public function __construct()
5955
*/
6056
public function tokenize($input)
6157
{
62-
$offset = 0;
63-
$tokens = [];
64-
6558
if (!preg_match_all($this->regex, $input, $matches, PREG_SET_ORDER)) {
66-
$this->throwSyntax('Invalid expression', $offset, $input);
59+
throw $this->throwSyntax('Invalid expression', 0, $input);
6760
}
6861

62+
$offset = 0;
63+
$tokens = [];
6964
foreach ($matches as $match) {
7065
$type = $this->offsetToToken[count($match) - 2];
71-
7266
if ($type !== 'skip') {
73-
$token = [
74-
'type' => $type,
75-
'value' => $match[0],
76-
'pos' => $offset
77-
];
78-
67+
$token = ['type' => $type, 'value' => $match[0], 'pos' => $offset];
7968
switch ($token['type']) {
8069
case 'quoted_identifier':
8170
$token['value'] = $this->decodeJson(
@@ -101,10 +90,8 @@ public function tokenize($input)
10190
$offset += strlen($match[0]);
10291
}
10392

104-
// Always end the token stream with an EOF token
10593
$tokens[] = ['type' => 'eof', 'pos' => $offset, 'value' => null];
10694

107-
// Ensure that the expression did not contain invalid characters
10895
if (strlen($input) != $offset) {
10996
$this->invalidExpression($input);
11097
}
@@ -114,31 +101,16 @@ public function tokenize($input)
114101

115102
private function takeLiteral($value, $offset, $input)
116103
{
117-
// Maps common JavaScript primitives with a native PHP primitive
118-
static $primitives = ['true' => 0, 'false' => 1, 'null' => 2];
119-
static $primitiveMap = [true, false, null];
120-
// If a literal starts with these characters, it is JSON decoded
121-
static $decodeCharacters = ['"' => 1, '[' => 1, '{' => 1];
122-
104+
static $valid = '/(true|false|null)|(^[\["{])|(^\-?[0-9]*(\.[0-9]+)?([e|E][+|\-][0-9]+)?$)/';
123105
$value = str_replace('\\`', '`', ltrim(substr($value, 1, -1)));
124106

125-
if (isset($primitives[$value])) {
126-
// Fast lookups for common JSON primitives
127-
return $primitiveMap[$primitives[$value]];
128-
} elseif (strlen($value) == 0) {
129-
$this->throwSyntax('Empty JSON literal', $offset, $input);
130-
} elseif (isset($decodeCharacters[$value[0]])) {
131-
// Always decode the JSON directly if it starts with these chars
132-
return $this->decodeJson($value, $offset, $input);
133-
} elseif (preg_match(
134-
'/^\-?[0-9]*(\.[0-9]+)?([e|E][+|\-][0-9]+)?$/',
135-
$value
136-
)) {
137-
// If it starts with a "-" or numbers, then attempt to JSON decode
138-
return $this->decodeJson($value, $offset, $input);
107+
if ($value === '') {
108+
throw $this->throwSyntax('Empty JSON literal', $offset, $input);
139109
}
140110

141-
return $this->decodeJson('"' . $value . '"', $offset, $input);
111+
return preg_match($valid, $value)
112+
? $this->decodeJson($value, $offset, $input)
113+
: $this->decodeJson('"' . $value . '"', $offset, $input);
142114
}
143115

144116
private function decodeJson($json, $offset, $input)
@@ -155,7 +127,7 @@ private function decodeJson($json, $offset, $input)
155127

156128
if ($error = json_last_error()) {
157129
$message = isset($errs[$error]) ? $errs[$error] : 'Unknown error';
158-
$this->throwSyntax(
130+
throw $this->throwSyntax(
159131
"Error decoding JSON: ({$error}) {$message}, given {$json}",
160132
$offset,
161133
$input
@@ -167,7 +139,7 @@ private function decodeJson($json, $offset, $input)
167139

168140
private function throwSyntax($message, $offset, $input)
169141
{
170-
throw new SyntaxErrorException(
142+
return new SyntaxErrorException(
171143
$message,
172144
['value' => substr($input, $offset, 1), 'pos' => $offset],
173145
$input
@@ -177,12 +149,10 @@ private function throwSyntax($message, $offset, $input)
177149
private function invalidExpression($input)
178150
{
179151
$offset = 0;
180-
$regex = $this->regex . 'A';
181-
182-
while (preg_match($regex, $input, $matches, 0, $offset)) {
152+
while (preg_match("{$this->regex}A", $input, $matches, 0, $offset)) {
183153
$offset += strlen($matches[0]);
184154
}
185155

186-
$this->throwSyntax('Unexpected character', $offset, $input);
156+
throw $this->throwSyntax('Unexpected character', $offset, $input);
187157
}
188158
}

0 commit comments

Comments
 (0)