66 */
class Lexer
{
    /** @var array Characters that can start an identifier */
    private $startIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true,
    ];

    /** @var array Number characters */
    private $numbers = [
        0 => true, 1 => true, 2 => true, 3 => true, 4 => true, 5 => true,
        6 => true, 7 => true, 8 => true, 9 => true,
    ];

    /** @var array Characters that can start a number (ctor calculated) */
    private $startNumber;

    /** @var array Valid identifier characters (ctor calculated) */
    private $validIdentifier;

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => 'dot',
        '*' => 'star',
        ']' => 'rbracket',
        ',' => 'comma',
        ':' => 'colon',
        '@' => 'current',
        '&' => 'expref',
        '(' => 'lparen',
        ')' => 'rparen',
        '{' => 'lbrace',
        '}' => 'rbrace',
    ];

    /** @var array Map of whitespace characters */
    private $whitespace = [
        ' '  => 'skip',
        "\t" => 'skip',
        "\n" => 'skip',
        "\r" => 'skip',
    ];

    /**
     * Builds the lookup tables that are derived from the static ones:
     * identifier characters are the identifier start characters plus the
     * digits, and a number may start with any digit or with "-".
     */
    public function __construct()
    {
        $this->validIdentifier = $this->startIdentifier + $this->numbers;
        $this->startNumber = $this->numbers;
        $this->startNumber['-'] = true;
    }

    /**
     * Splits an expression into a list of tokens.
     *
     * Each token is an array with the keys "type", "value" and "pos" (the
     * byte offset of the token's first character in $input). Whitespace is
     * skipped. Input that cannot be lexed does not raise an error here;
     * it is emitted as a token of type "unknown". The list always ends
     * with an "eof" token whose "pos" is strlen($input).
     *
     * @param string $input Expression to tokenize.
     *
     * @return array Returns the list of tokens.
     */
    public function tokenize($input)
    {
        $tokens = [];

        // Older PHP versions return [''] from str_split(''), which would be
        // lexed as a single "unknown" token, so empty input is special-cased.
        $chars = $input === '' ? [] : str_split($input);

        // The array's internal pointer is the lexer cursor. Helpers receive
        // $chars by reference and advance the same pointer.
        while (($current = current($chars)) !== false) {
            if (isset($this->simpleTokens[$current])) {
                // Consume simple tokens like ".", ",", "@", etc.
                $tokens[] = [
                    'type'  => $this->simpleTokens[$current],
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
            } elseif (isset($this->whitespace[$current])) {
                // Skip whitespace
                next($chars);
            } elseif (isset($this->startIdentifier[$current])) {
                // Consume identifiers
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->validIdentifier[$current]));
                $tokens[] = [
                    'type'  => 'identifier',
                    'value' => $buffer,
                    'pos'   => $start
                ];
            } elseif (isset($this->startNumber[$current])) {
                // Consume numbers
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->numbers[$current]));
                if ($buffer === '-') {
                    // A sign with no digits is not a number; without this
                    // check it would be cast to the number 0.
                    $tokens[] = [
                        'type'  => 'unknown',
                        'value' => $buffer,
                        'pos'   => $start
                    ];
                } else {
                    $tokens[] = [
                        'type'  => 'number',
                        'value' => (int) $buffer,
                        'pos'   => $start
                    ];
                }
            } elseif ($current === '|') {
                // Consume pipe and OR
                $tokens[] = $this->matchOr($chars, '|', '|', 'or', 'pipe');
            } elseif ($current === '[') {
                // Consume "[", "[?", and "[]"
                $position = key($chars);
                $actual = next($chars);
                if ($actual === ']') {
                    next($chars);
                    $tokens[] = [
                        'type'  => 'flatten',
                        'pos'   => $position,
                        'value' => '[]'
                    ];
                } elseif ($actual === '?') {
                    next($chars);
                    $tokens[] = [
                        'type'  => 'filter',
                        'pos'   => $position,
                        'value' => '[?'
                    ];
                } else {
                    // The character after "[" is left for the next pass.
                    $tokens[] = [
                        'type'  => 'lbracket',
                        'pos'   => $position,
                        'value' => '['
                    ];
                }
            } elseif ($current === "'") {
                // Consume raw string literals
                $token = $this->inside($chars, "'", 'literal');
                if ($token['type'] === 'literal') {
                    // inside() keeps escapes verbatim; unescape the delimiter.
                    $token['value'] = str_replace("\\'", "'", $token['value']);
                }
                $tokens[] = $token;
            } elseif ($current === '`') {
                // Consume JSON literals
                $token = $this->inside($chars, '`', 'literal');
                if ($token['type'] === 'literal') {
                    $token['value'] = str_replace('\\`', '`', $token['value']);
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;
            } elseif ($current === '"') {
                // Consume quoted identifiers
                $token = $this->inside($chars, '"', 'quoted_identifier');
                if ($token['type'] === 'quoted_identifier') {
                    // Re-wrap in quotes so the value decodes as a JSON string.
                    $token['value'] = '"' . $token['value'] . '"';
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;
            } elseif ($current === '!') {
                // Consume not equal
                $tokens[] = $this->matchOr($chars, '!', '=', 'comparator', 'unknown');
            } elseif ($current === '>' || $current === '<') {
                // Consume less than and greater than
                $tokens[] = $this->matchOr($chars, $current, '=', 'comparator', 'comparator');
            } elseif ($current === '=') {
                // Consume equals
                $tokens[] = $this->matchOr($chars, '=', '=', 'comparator', 'unknown');
            } else {
                $tokens[] = [
                    'type'  => 'unknown',
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
            }
        }

        $tokens[] = [
            'type'  => 'eof',
            'pos'   => strlen($input),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Returns a token based on whether or not the next character matches the
     * expected value. If it does, a token of "$type" is returned and both
     * characters are consumed. Otherwise, a token of "$orElse" type is
     * returned and only the current character is consumed.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        $position = key($chars);
        $actual = next($chars);

        if ($actual === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming inside of delimiter
     * characters. A backslash and the character following it are copied to
     * the value verbatim, so an escaped delimiter does not end the token;
     * unescaping is left to the caller. If the token is not closed,
     * "unknown" is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $current = next($chars);
        $buffer = '';

        while ($current !== $delim) {
            if ($current === '\\') {
                // Keep the backslash and step to the escaped character so
                // that it is appended below without being tested as $delim.
                $buffer .= '\\';
                $current = next($chars);
            }

            if ($current === false) {
                // Reached the end of the input before the closing delimiter.
                return [
                    'type'  => 'unknown',
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }

            $buffer .= $current;
            $current = next($chars);
        }

        // Step past the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Leave the undecoded value in place and flag the token.
            $token['type'] = 'unknown';
            return $token;
        }

        $token['value'] = $value;
        return $token;
    }
}
0 commit comments