 from jmespath.exceptions import LexerError, EmptyExpressionError


-START_IDENTIFIER = set(string.ascii_letters + '_')
-VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
-START_NUMBER = set(string.digits + '-')
-VALID_NUMBER = set(string.digits)
-WHITESPACE = set(" \t\n\r")
-SIMPLE_TOKENS = {
-    '.': 'dot',
-    '*': 'star',
-    ']': 'rbracket',
-    ',': 'comma',
-    ':': 'colon',
-    '@': 'current',
-    '&': 'expref',
-    '(': 'lparen',
-    ')': 'rparen',
-    '{': 'lbrace',
-    '}': 'rbrace'
-}
-
-
 class Scanner(object):
     def __init__(self, expression):
         if not expression:
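
The guard above suggests Scanner rejects empty input up front. A minimal sketch of the expected behavior, assuming the truncated branch raises the imported EmptyExpressionError and that this module is importable as jmespath.lexer (an assumed path, not shown in the diff):

    from jmespath.exceptions import EmptyExpressionError
    from jmespath.lexer import Lexer

    try:
        # The empty expression is rejected before any token is produced.
        list(Lexer().tokenize(''))
    except EmptyExpressionError:
        print('empty expressions are rejected at scan time')
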
@@ -45,68 +25,87 @@ def next(self):

     def in_delimiter(self, delimiter):
         start = self.pos
-        buffer = ''
+        buff = ''
         self.next()
         while self.current != delimiter:
             if self.current == '\\':
-                buffer += '\\'
+                buff += '\\'
                 self.next()
             if self.current is None:
-                print(buffer)
                 raise LexerError(lexer_position=start,
                                  lexer_value=self.expression,
-                                 message="Unclosed delimiter: %s" % buffer)
-            buffer += self.current
+                                 message="Unclosed %s delimiter" % delimiter)
+            buff += self.current
             self.next()
+        # Skip the closing delimiter.
         self.next()
-        return buffer
+        return buff


 class Lexer(object):
+    START_IDENTIFIER = set(string.ascii_letters + '_')
+    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
+    START_NUMBER = set(string.digits + '-')
+    VALID_NUMBER = set(string.digits)
+    WHITESPACE = set(" \t\n\r")
+    SIMPLE_TOKENS = {
+        '.': 'dot',
+        '*': 'star',
+        ']': 'rbracket',
+        ',': 'comma',
+        ':': 'colon',
+        '@': 'current',
+        '&': 'expref',
+        '(': 'lparen',
+        ')': 'rparen',
+        '{': 'lbrace',
+        '}': 'rbrace'
+    }
+
     def tokenize(self, expression):
         scanner = Scanner(expression)
         while scanner.current is not None:
-            if scanner.current in SIMPLE_TOKENS:
-                yield {'type': SIMPLE_TOKENS[scanner.current],
+            if scanner.current in self.SIMPLE_TOKENS:
+                yield {'type': self.SIMPLE_TOKENS[scanner.current],
                        'value': scanner.current,
-                       'start': scanner.pos, 'end': scanner.pos}
+                       'start': scanner.pos, 'end': scanner.pos + 1}
                 scanner.next()
-            elif scanner.current in START_IDENTIFIER:
+            elif scanner.current in self.START_IDENTIFIER:
                 start = scanner.pos
-                buffer = scanner.current
-                while scanner.next() in VALID_IDENTIFIER:
-                    buffer += scanner.current
-                yield {'type': 'unquoted_identifier', 'value': buffer,
-                       'start': start, 'end': len(buffer)}
-            elif scanner.current in WHITESPACE:
+                buff = scanner.current
+                while scanner.next() in self.VALID_IDENTIFIER:
+                    buff += scanner.current
+                yield {'type': 'unquoted_identifier', 'value': buff,
+                       'start': start, 'end': start + len(buff)}
+            elif scanner.current in self.WHITESPACE:
                 scanner.next()
             elif scanner.current == '[':
                 start = scanner.pos
                 next_char = scanner.next()
                 if next_char == ']':
                     scanner.next()
                     yield {'type': 'flatten', 'value': '[]',
-                           'start': start, 'end': start + 1}
+                           'start': start, 'end': start + 2}
                 elif next_char == '?':
                     scanner.next()
                     yield {'type': 'filter', 'value': '[?',
-                           'start': start, 'end': start + 1}
+                           'start': start, 'end': start + 2}
                 else:
                     yield {'type': 'lbracket', 'value': '[',
-                           'start': start, 'end': start}
+                           'start': start, 'end': start + 1}
             elif scanner.current == "'":
                 yield self._consume_raw_string_literal(scanner)
             elif scanner.current == '|':
                 yield self._match_or_else(scanner, '|', 'or', 'pipe')
             elif scanner.current == '`':
                 yield self._consume_literal(scanner)
-            elif scanner.current in START_NUMBER:
+            elif scanner.current in self.START_NUMBER:
                 start = scanner.pos
-                buffer = scanner.current
-                while scanner.next() in VALID_NUMBER:
-                    buffer += scanner.current
-                yield {'type': 'number', 'value': int(buffer),
-                       'start': start, 'end': len(buffer)}
+                buff = scanner.current
+                while scanner.next() in self.VALID_NUMBER:
+                    buff += scanner.current
+                yield {'type': 'number', 'value': int(buff),
+                       'start': start, 'end': start + len(buff)}
             elif scanner.current == '"':
                 yield self._consume_quoted_identifier(scanner)
             elif scanner.current == '<':
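
With the scanner.pos + 1 and start + len(buff) fixes above, each token's start/end now forms a half-open slice into the source expression instead of reusing a stale length. A quick sanity check, as a sketch under the same jmespath.lexer import assumption:

    from jmespath.lexer import Lexer

    expression = "foo[0].bar"
    for token in Lexer().tokenize(expression):
        if token['type'] != 'eof':
            # Slicing the expression by start/end recovers each lexeme.
            assert expression[token['start']:token['end']] == str(token['value'])
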
@@ -118,15 +117,16 @@ def tokenize(self, expression):
             elif scanner.current == '=':
                 yield self._match_or_else(scanner, '=', 'eq', 'unknown')
             else:
-                yield {'type': 'unknown', 'value': scanner.current,
-                       'start': scanner.pos, 'end': scanner.pos}
-                scanner.next()
+                raise LexerError(lexer_position=scanner.pos,
+                                 lexer_value=scanner.current,
+                                 message="Unknown token %s" % scanner.current)
         yield {'type': 'eof', 'value': '',
                'start': len(expression), 'end': len(expression)}

     def _consume_literal(self, scanner):
         start = scanner.pos
         lexeme = scanner.in_delimiter('`')
+        lexeme = lexeme.replace('\\`', '`')
         try:
             # Assume it is valid JSON and attempt to parse.
             parsed_json = loads(lexeme)
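
The added replace() call unescapes backslash-escaped backticks before the JSON parse, so a literal can now contain a backtick. For instance, a sketch:

    from jmespath.lexer import Lexer

    # The escaped backtick inside the literal becomes a plain backtick
    # in the parsed value.
    token = next(Lexer().tokenize('`"a\\`b"`'))
    assert token['type'] == 'literal'
    assert token['value'] == 'a`b'
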
@@ -141,15 +141,17 @@ def _consume_literal(self, scanner):
             raise LexerError(lexer_position=start,
                              lexer_value=lexeme,
                              message="Bad token %s" % lexeme)
+        token_len = scanner.pos - start
         return {'type': 'literal', 'value': parsed_json,
-                'start': start, 'end': len(lexeme)}
+                'start': start, 'end': token_len}

     def _consume_quoted_identifier(self, scanner):
         start = scanner.pos
-        lexeme = scanner.in_delimiter('"')
+        lexeme = '"' + scanner.in_delimiter('"') + '"'
         try:
+            token_len = scanner.pos - start
             return {'type': 'quoted_identifier', 'value': loads(lexeme),
-                    'start': start, 'end': len(lexeme)}
+                    'start': start, 'end': token_len}
         except ValueError as e:
             error_message = str(e).split(':')[0]
             raise LexerError(lexer_position=start,
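
Re-wrapping the lexeme in double quotes keeps loads() working on the raw identifier text, while scanner.pos - start makes end cover the full quoted span, quotes included. Roughly:

    from jmespath.lexer import Lexer

    tokens = list(Lexer().tokenize('"foo bar"'))
    # The value is the JSON-decoded identifier...
    assert tokens[0]['value'] == 'foo bar'
    # ...and start/end span all nine source characters, both quotes included.
    assert (tokens[0]['start'], tokens[0]['end']) == (0, 9)
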
@@ -159,8 +161,9 @@ def _consume_quoted_identifier(self, scanner):
     def _consume_raw_string_literal(self, scanner):
         start = scanner.pos
         lexeme = scanner.in_delimiter("'")
+        token_len = scanner.pos - start
         return {'type': 'literal', 'value': lexeme,
-                'start': start, 'end': len(lexeme)}
+                'start': start, 'end': token_len}

     def _match_or_else(self, scanner, expected, match_type, else_type):
         start = scanner.pos
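
One behavioral consequence of the else branch earlier in tokenize: an unrecognized character now raises LexerError during tokenization instead of emitting an 'unknown' token for the parser to reject later. A sketch:

    from jmespath.exceptions import LexerError
    from jmespath.lexer import Lexer

    try:
        # '#' matches no token rule, so the lexer fails immediately.
        list(Lexer().tokenize('foo#bar'))
    except LexerError as e:
        print(e)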