5
5
from jmespath .exceptions import LexerError , EmptyExpressionError
6
6
7
7
8
class Scanner(object):
    """Single-character cursor over a non-empty expression string.

    ``current`` is the character under the cursor, or ``None`` once the
    cursor has been advanced beyond the final character.  ``pos`` is the
    index of the most recently read character; it is *not* advanced when
    the end of the input is reached.
    """

    def __init__(self, expression):
        """Place the cursor on the first character of ``expression``.

        Raises EmptyExpressionError when ``expression`` is empty/falsy.
        """
        if not expression:
            raise EmptyExpressionError()
        self.expression = expression
        self.pos = 0
        self.chars = list(expression)
        self.len = len(expression)
        self.current = self.chars[0]

    def next(self):
        """Advance one character and return it (``None`` at end of input)."""
        following = self.pos + 1
        if following == self.len:
            # Exhausted: ``pos`` intentionally stays on the last valid index.
            self.current = None
            return None
        self.pos = following
        self.current = self.chars[following]
        return self.current

    def in_delimiter(self, delimiter):
        """Return the text enclosed by ``delimiter``, starting at the cursor.

        The cursor must be on the opening delimiter.  A backslash keeps
        itself and the character after it in the result, so an escaped
        delimiter does not terminate the scan.  On return the cursor is
        one past the closing delimiter.

        Raises LexerError (positioned at the opening delimiter) when the
        input ends before a closing delimiter is seen.
        """
        opened_at = self.pos
        pieces = []
        self.next()
        while self.current != delimiter:
            if self.current == '\\':
                # Keep the backslash; the escaped character is appended below.
                pieces.append('\\')
                self.next()
            if self.current is None:
                raise LexerError(lexer_position=opened_at,
                                 lexer_value=self.expression,
                                 message="Unclosed %s delimiter" % delimiter)
            pieces.append(self.current)
            self.next()
        # Skip the closing delimiter.
        self.next()
        return ''.join(pieces)
43
-
44
-
45
8
class Lexer (object ):
46
9
START_IDENTIFIER = set (string .ascii_letters + '_' )
47
10
VALID_IDENTIFIER = set (string .ascii_letters + string .digits + '_' )
@@ -63,69 +26,104 @@ class Lexer(object):
63
26
}
64
27
65
28
def tokenize(self, expression):
    """Lex ``expression`` and lazily yield one token dict per lexeme.

    Every token is a dict with the keys ``type``, ``value``, ``start``
    and ``end``.  The stream is always terminated by a token of type
    ``'eof'`` whose ``start`` and ``end`` are both the expression length.

    This is a generator: nothing runs (including the empty-expression
    check performed by ``_init_expr``) until iteration begins.  Cursor
    state is kept on ``self``, so two tokenizations on the same instance
    must not be interleaved.

    Raises:
        EmptyExpressionError: from ``_init_expr`` when ``expression`` is
            empty.
        LexerError: on a character that starts no known token, or on a
            number lexeme that is not a valid integer.
    """
    self._init_expr(expression)
    while self._current is not None:
        if self._current in self.SIMPLE_TOKENS:
            yield {'type': self.SIMPLE_TOKENS[self._current],
                   'value': self._current,
                   'start': self._pos, 'end': self._pos + 1}
            self._next()
        elif self._current in self.START_IDENTIFIER:
            start = self._pos
            buff = self._current
            while self._next() in self.VALID_IDENTIFIER:
                buff += self._current
            yield {'type': 'unquoted_identifier', 'value': buff,
                   'start': start, 'end': start + len(buff)}
        elif self._current in self.WHITESPACE:
            self._next()
        elif self._current == '[':
            start = self._pos
            next_char = self._next()
            if next_char == ']':
                self._next()
                yield {'type': 'flatten', 'value': '[]',
                       'start': start, 'end': start + 2}
            elif next_char == '?':
                self._next()
                yield {'type': 'filter', 'value': '[?',
                       'start': start, 'end': start + 2}
            else:
                # The look-ahead character is left as the current one so
                # the next loop iteration lexes it normally.
                yield {'type': 'lbracket', 'value': '[',
                       'start': start, 'end': start + 1}
        elif self._current == "'":
            yield self._consume_raw_string_literal()
        elif self._current == '|':
            yield self._match_or_else('|', 'or', 'pipe')
        elif self._current == '`':
            yield self._consume_literal()
        elif self._current in self.START_NUMBER:
            start = self._pos
            buff = self._current
            while self._next() in self.VALID_NUMBER:
                buff += self._current
            try:
                number = int(buff)
            except ValueError:
                # NOTE(review): START_NUMBER is defined outside this view;
                # if it admits a character that is not a complete integer
                # on its own (e.g. a lone sign), report it as a lexing
                # error instead of leaking a bare ValueError.
                raise LexerError(lexer_position=start,
                                 lexer_value=buff,
                                 message="Unknown token %s" % buff)
            yield {'type': 'number', 'value': number,
                   'start': start, 'end': start + len(buff)}
        elif self._current == '"':
            yield self._consume_quoted_identifier()
        elif self._current == '<':
            yield self._match_or_else('=', 'lte', 'lt')
        elif self._current == '>':
            yield self._match_or_else('=', 'gte', 'gt')
        elif self._current == '!':
            yield self._match_or_else('=', 'ne', 'unknown')
        elif self._current == '=':
            yield self._match_or_else('=', 'eq', 'unknown')
        else:
            raise LexerError(lexer_position=self._pos,
                             lexer_value=self._current,
                             message="Unknown token %s" % self._current)
    yield {'type': 'eof', 'value': '',
           'start': self._len, 'end': self._len}
88
+
89
+ def _init_expr (self , expression ):
90
+ if not expression :
91
+ raise EmptyExpressionError ()
92
+ self ._pos = 0
93
+ self ._expression = expression
94
+ self ._chars = list (self ._expression )
95
+ self ._current = self ._chars [self ._pos ]
96
+ self ._len = len (self ._expression )
97
+
98
+ def _next (self ):
99
+ if self ._pos == self ._len - 1 :
100
+ self ._current = None
101
+ else :
102
+ self ._pos += 1
103
+ self ._current = self ._chars [self ._pos ]
104
+ return self ._current
105
+
106
+ def _in_delimiter (self , delimiter ):
107
+ start = self ._pos
108
+ buff = ''
109
+ self ._next ()
110
+ while self ._current != delimiter :
111
+ if self ._current == '\\ ' :
112
+ buff += '\\ '
113
+ self ._next ()
114
+ if self ._current is None :
115
+ raise LexerError (lexer_position = start ,
116
+ lexer_value = self ._expression ,
117
+ message = "Unclosed %s delimiter" % delimiter )
118
+ buff += self ._current
119
+ self ._next ()
120
+ # Skip the closing delimiter.
121
+ self ._next ()
122
+ return buff
125
123
126
- def _consume_literal (self , scanner ):
127
- start = scanner . pos
128
- lexeme = scanner . in_delimiter ('`' )
124
+ def _consume_literal (self ):
125
+ start = self . _pos
126
+ lexeme = self . _in_delimiter ('`' )
129
127
lexeme = lexeme .replace ('\\ `' , '`' )
130
128
try :
131
129
# Assume it is valid JSON and attempt to parse.
@@ -139,17 +137,17 @@ def _consume_literal(self, scanner):
139
137
PendingDeprecationWarning )
140
138
except ValueError :
141
139
raise LexerError (lexer_position = start ,
142
- lexer_value = lexeme ,
140
+ lexer_value = self . _expression ,
143
141
message = "Bad token %s" % lexeme )
144
- token_len = scanner . pos - start
142
+ token_len = self . _pos - start
145
143
return {'type' : 'literal' , 'value' : parsed_json ,
146
144
'start' : start , 'end' : token_len }
147
145
148
- def _consume_quoted_identifier (self , scanner ):
149
- start = scanner . pos
150
- lexeme = '"' + scanner . in_delimiter ('"' ) + '"'
146
+ def _consume_quoted_identifier (self ):
147
+ start = self . _pos
148
+ lexeme = '"' + self . _in_delimiter ('"' ) + '"'
151
149
try :
152
- token_len = scanner . pos - start
150
+ token_len = self . _pos - start
153
151
return {'type' : 'quoted_identifier' , 'value' : loads (lexeme ),
154
152
'start' : start , 'end' : token_len }
155
153
except ValueError as e :
@@ -158,19 +156,19 @@ def _consume_quoted_identifier(self, scanner):
158
156
lexer_value = lexeme ,
159
157
message = error_message )
160
158
161
- def _consume_raw_string_literal (self , scanner ):
162
- start = scanner . pos
163
- lexeme = scanner . in_delimiter ("'" )
164
- token_len = scanner . pos - start
159
+ def _consume_raw_string_literal (self ):
160
+ start = self . _pos
161
+ lexeme = self . _in_delimiter ("'" )
162
+ token_len = self . _pos - start
165
163
return {'type' : 'literal' , 'value' : lexeme ,
166
164
'start' : start , 'end' : token_len }
167
165
168
- def _match_or_else (self , scanner , expected , match_type , else_type ):
169
- start = scanner . pos
170
- current = scanner . current
171
- next_char = scanner . next ()
166
+ def _match_or_else (self , expected , match_type , else_type ):
167
+ start = self . _pos
168
+ current = self . _current
169
+ next_char = self . _next ()
172
170
if next_char == expected :
173
- scanner . next ()
171
+ self . _next ()
174
172
return {'type' : match_type , 'value' : current + next_char ,
175
173
'start' : start , 'end' : start + 1 }
176
174
return {'type' : else_type , 'value' : current ,
0 commit comments