Skip to content

Commit a234f73

Browse files
committed
Using a stateful lexer rather than a Scanner object
1 parent 1f0ad9f commit a234f73

File tree

1 file changed

+97
-99
lines changed

1 file changed

+97
-99
lines changed

jmespath/lexer.py

Lines changed: 97 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -5,43 +5,6 @@
55
from jmespath.exceptions import LexerError, EmptyExpressionError
66

77

8-
class Scanner(object):
9-
def __init__(self, expression):
10-
if not expression:
11-
raise EmptyExpressionError()
12-
self.expression = expression
13-
self.pos = 0
14-
self.chars = list(self.expression)
15-
self.len = len(self.expression)
16-
self.current = self.chars[self.pos]
17-
18-
def next(self):
19-
if self.pos == self.len - 1:
20-
self.current = None
21-
else:
22-
self.pos += 1
23-
self.current = self.chars[self.pos]
24-
return self.current
25-
26-
def in_delimiter(self, delimiter):
27-
start = self.pos
28-
buff = ''
29-
self.next()
30-
while self.current != delimiter:
31-
if self.current == '\\':
32-
buff += '\\'
33-
self.next()
34-
if self.current is None:
35-
raise LexerError(lexer_position=start,
36-
lexer_value=self.expression,
37-
message="Unclosed %s delimiter" % delimiter)
38-
buff += self.current
39-
self.next()
40-
# Skip the closing delimiter.
41-
self.next()
42-
return buff
43-
44-
458
class Lexer(object):
469
START_IDENTIFIER = set(string.ascii_letters + '_')
4710
VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
@@ -63,69 +26,104 @@ class Lexer(object):
6326
}
6427

6528
def tokenize(self, expression):
66-
scanner = Scanner(expression)
67-
while scanner.current is not None:
68-
if scanner.current in self.SIMPLE_TOKENS:
69-
yield {'type': self.SIMPLE_TOKENS[scanner.current],
70-
'value': scanner.current,
71-
'start': scanner.pos, 'end': scanner.pos + 1}
72-
scanner.next()
73-
elif scanner.current in self.START_IDENTIFIER:
74-
start = scanner.pos
75-
buff = scanner.current
76-
while scanner.next() in self.VALID_IDENTIFIER:
77-
buff += scanner.current
29+
self._init_expr(expression)
30+
while self._current is not None:
31+
if self._current in self.SIMPLE_TOKENS:
32+
yield {'type': self.SIMPLE_TOKENS[self._current],
33+
'value': self._current,
34+
'start': self._pos, 'end': self._pos + 1}
35+
self._next()
36+
elif self._current in self.START_IDENTIFIER:
37+
start = self._pos
38+
buff = self._current
39+
while self._next() in self.VALID_IDENTIFIER:
40+
buff += self._current
7841
yield {'type': 'unquoted_identifier', 'value': buff,
7942
'start': start, 'end': start + len(buff)}
80-
elif scanner.current in self.WHITESPACE:
81-
scanner.next()
82-
elif scanner.current == '[':
83-
start = scanner.pos
84-
next_char = scanner.next()
43+
elif self._current in self.WHITESPACE:
44+
self._next()
45+
elif self._current == '[':
46+
start = self._pos
47+
next_char = self._next()
8548
if next_char == ']':
86-
scanner.next()
49+
self._next()
8750
yield {'type': 'flatten', 'value': '[]',
8851
'start': start, 'end': start + 2}
8952
elif next_char == '?':
90-
scanner.next()
53+
self._next()
9154
yield {'type': 'filter', 'value': '[?',
9255
'start': start, 'end': start + 2}
9356
else:
9457
yield {'type': 'lbracket', 'value': '[',
9558
'start': start, 'end': start + 1}
96-
elif scanner.current == "'":
97-
yield self._consume_raw_string_literal(scanner)
98-
elif scanner.current == '|':
99-
yield self._match_or_else(scanner, '|', 'or', 'pipe')
100-
elif scanner.current == '`':
101-
yield self._consume_literal(scanner)
102-
elif scanner.current in self.START_NUMBER:
103-
start = scanner.pos
104-
buff = scanner.current
105-
while scanner.next() in self.VALID_NUMBER:
106-
buff += scanner.current
59+
elif self._current == "'":
60+
yield self._consume_raw_string_literal()
61+
elif self._current == '|':
62+
yield self._match_or_else('|', 'or', 'pipe')
63+
elif self._current == '`':
64+
yield self._consume_literal()
65+
elif self._current in self.START_NUMBER:
66+
start = self._pos
67+
buff = self._current
68+
while self._next() in self.VALID_NUMBER:
69+
buff += self._current
10770
yield {'type': 'number', 'value': int(buff),
10871
'start': start, 'end': start + len(buff)}
109-
elif scanner.current == '"':
110-
yield self._consume_quoted_identifier(scanner)
111-
elif scanner.current == '<':
112-
yield self._match_or_else(scanner, '=', 'lte', 'lt')
113-
elif scanner.current == '>':
114-
yield self._match_or_else(scanner, '=', 'gte', 'gt')
115-
elif scanner.current == '!':
116-
yield self._match_or_else(scanner, '=', 'ne', 'unknown')
117-
elif scanner.current == '=':
118-
yield self._match_or_else(scanner, '=', 'eq', 'unknown')
72+
elif self._current == '"':
73+
yield self._consume_quoted_identifier()
74+
elif self._current == '<':
75+
yield self._match_or_else('=', 'lte', 'lt')
76+
elif self._current == '>':
77+
yield self._match_or_else('=', 'gte', 'gt')
78+
elif self._current == '!':
79+
yield self._match_or_else('=', 'ne', 'unknown')
80+
elif self._current == '=':
81+
yield self._match_or_else('=', 'eq', 'unknown')
11982
else:
120-
raise LexerError(lexer_position=scanner.pos,
121-
lexer_value=scanner.current,
122-
message="Unknown token %s" % scanner.current)
83+
raise LexerError(lexer_position=self._pos,
84+
lexer_value=self._current,
85+
message="Unknown token %s" % self._current)
12386
yield {'type': 'eof', 'value': '',
124-
'start': len(expression), 'end': len(expression)}
87+
'start': self._len, 'end': self._len}
88+
89+
def _init_expr(self, expression):
90+
if not expression:
91+
raise EmptyExpressionError()
92+
self._pos = 0
93+
self._expression = expression
94+
self._chars = list(self._expression)
95+
self._current = self._chars[self._pos]
96+
self._len = len(self._expression)
97+
98+
def _next(self):
99+
if self._pos == self._len - 1:
100+
self._current = None
101+
else:
102+
self._pos += 1
103+
self._current = self._chars[self._pos]
104+
return self._current
105+
106+
def _in_delimiter(self, delimiter):
107+
start = self._pos
108+
buff = ''
109+
self._next()
110+
while self._current != delimiter:
111+
if self._current == '\\':
112+
buff += '\\'
113+
self._next()
114+
if self._current is None:
115+
raise LexerError(lexer_position=start,
116+
lexer_value=self._expression,
117+
message="Unclosed %s delimiter" % delimiter)
118+
buff += self._current
119+
self._next()
120+
# Skip the closing delimiter.
121+
self._next()
122+
return buff
125123

126-
def _consume_literal(self, scanner):
127-
start = scanner.pos
128-
lexeme = scanner.in_delimiter('`')
124+
def _consume_literal(self):
125+
start = self._pos
126+
lexeme = self._in_delimiter('`')
129127
lexeme = lexeme.replace('\\`', '`')
130128
try:
131129
# Assume it is valid JSON and attempt to parse.
@@ -139,17 +137,17 @@ def _consume_literal(self, scanner):
139137
PendingDeprecationWarning)
140138
except ValueError:
141139
raise LexerError(lexer_position=start,
142-
lexer_value=lexeme,
140+
lexer_value=self._expression,
143141
message="Bad token %s" % lexeme)
144-
token_len = scanner.pos - start
142+
token_len = self._pos - start
145143
return {'type': 'literal', 'value': parsed_json,
146144
'start': start, 'end': token_len}
147145

148-
def _consume_quoted_identifier(self, scanner):
149-
start = scanner.pos
150-
lexeme = '"' + scanner.in_delimiter('"') + '"'
146+
def _consume_quoted_identifier(self):
147+
start = self._pos
148+
lexeme = '"' + self._in_delimiter('"') + '"'
151149
try:
152-
token_len = scanner.pos - start
150+
token_len = self._pos - start
153151
return {'type': 'quoted_identifier', 'value': loads(lexeme),
154152
'start': start, 'end': token_len}
155153
except ValueError as e:
@@ -158,19 +156,19 @@ def _consume_quoted_identifier(self, scanner):
158156
lexer_value=lexeme,
159157
message=error_message)
160158

161-
def _consume_raw_string_literal(self, scanner):
162-
start = scanner.pos
163-
lexeme = scanner.in_delimiter("'")
164-
token_len = scanner.pos - start
159+
def _consume_raw_string_literal(self):
160+
start = self._pos
161+
lexeme = self._in_delimiter("'")
162+
token_len = self._pos - start
165163
return {'type': 'literal', 'value': lexeme,
166164
'start': start, 'end': token_len}
167165

168-
def _match_or_else(self, scanner, expected, match_type, else_type):
169-
start = scanner.pos
170-
current = scanner.current
171-
next_char = scanner.next()
166+
def _match_or_else(self, expected, match_type, else_type):
167+
start = self._pos
168+
current = self._current
169+
next_char = self._next()
172170
if next_char == expected:
173-
scanner.next()
171+
self._next()
174172
return {'type': match_type, 'value': current + next_char,
175173
'start': start, 'end': start + 1}
176174
return {'type': else_type, 'value': current,

0 commit comments

Comments
 (0)