Commit b0c1004

Merge branch 'mtdowling-lexer-perf' into develop

* mtdowling-lexer-perf:
  Rename variables to be full words
  Using a stateful lexer rather than a Scanner object
  Lexer cleanup and test fixes
  Lexer bug fixes
  Removing the computed table is just as fast but less code
  No longer using a regex based lexer

2 parents 2c6d516 + e7c337e commit b0c1004
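
The branch name and the commit subjects point at lexer throughput as the motivation. A quick way to sanity-check that kind of change is a micro-benchmark over a representative expression; below is a minimal sketch (the expression and iteration count are illustrative, not taken from the commit):

```python
# Hypothetical micro-benchmark; assumes the jmespath package is importable.
import timeit

setup = "from jmespath.lexer import Lexer; lexer = Lexer()"
stmt = "list(lexer.tokenize('foo.bar[0] | baz[?a == `1`]'))"

# Time 10,000 full tokenize passes; lower is better.
elapsed = timeit.timeit(stmt, setup=setup, number=10000)
print("10k tokenize passes: %.3fs" % elapsed)
```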

File tree

3 files changed: +168 -134 lines

  jmespath/lexer.py
  tests/test_lexer.py
  tests/test_parser.py


jmespath/lexer.py

Lines changed: 156 additions & 127 deletions
@@ -1,148 +1,177 @@
-import re
+import string
 import warnings
 from json import loads

 from jmespath.exceptions import LexerError, EmptyExpressionError


 class Lexer(object):
-    TOKENS = (
-        r'(?P<number>-?\d+)|'
-        r'(?P<unquoted_identifier>([a-zA-Z_][a-zA-Z_0-9]*))|'
-        r'(?P<quoted_identifier>("(?:\\\\|\\"|[^"])*"))|'
-        r'(?P<string_literal>(\'(?:\\\\|\\\'|[^\'])*\'))|'
-        r'(?P<literal>(`(?:\\\\|\\`|[^`])*`))|'
-        r'(?P<filter>\[\?)|'
-        r'(?P<or>\|\|)|'
-        r'(?P<pipe>\|)|'
-        r'(?P<ne>!=)|'
-        r'(?P<rbrace>\})|'
-        r'(?P<eq>==)|'
-        r'(?P<dot>\.)|'
-        r'(?P<star>\*)|'
-        r'(?P<gte>>=)|'
-        r'(?P<lparen>\()|'
-        r'(?P<lbrace>\{)|'
-        r'(?P<lte><=)|'
-        r'(?P<flatten>\[\])|'
-        r'(?P<rbracket>\])|'
-        r'(?P<lbracket>\[)|'
-        r'(?P<rparen>\))|'
-        r'(?P<comma>,)|'
-        r'(?P<colon>:)|'
-        r'(?P<lt><)|'
-        r'(?P<expref>&)|'
-        r'(?P<gt>>)|'
-        r'(?P<current>@)|'
-        r'(?P<skip>[ \t]+)'
-    )
-
-    def __init__(self):
-        self.master_regex = re.compile(self.TOKENS)
+    START_IDENTIFIER = set(string.ascii_letters + '_')
+    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
+    START_NUMBER = set(string.digits + '-')
+    VALID_NUMBER = set(string.digits)
+    WHITESPACE = set(" \t\n\r")
+    SIMPLE_TOKENS = {
+        '.': 'dot',
+        '*': 'star',
+        ']': 'rbracket',
+        ',': 'comma',
+        ':': 'colon',
+        '@': 'current',
+        '&': 'expref',
+        '(': 'lparen',
+        ')': 'rparen',
+        '{': 'lbrace',
+        '}': 'rbrace'
+    }

     def tokenize(self, expression):
+        self._initialize_for_expression(expression)
+        while self._current is not None:
+            if self._current in self.SIMPLE_TOKENS:
+                yield {'type': self.SIMPLE_TOKENS[self._current],
+                       'value': self._current,
+                       'start': self._position, 'end': self._position + 1}
+                self._next()
+            elif self._current in self.START_IDENTIFIER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_IDENTIFIER:
+                    buff += self._current
+                yield {'type': 'unquoted_identifier', 'value': buff,
+                       'start': start, 'end': start + len(buff)}
+            elif self._current in self.WHITESPACE:
+                self._next()
+            elif self._current == '[':
+                start = self._position
+                next_char = self._next()
+                if next_char == ']':
+                    self._next()
+                    yield {'type': 'flatten', 'value': '[]',
+                           'start': start, 'end': start + 2}
+                elif next_char == '?':
+                    self._next()
+                    yield {'type': 'filter', 'value': '[?',
+                           'start': start, 'end': start + 2}
+                else:
+                    yield {'type': 'lbracket', 'value': '[',
+                           'start': start, 'end': start + 1}
+            elif self._current == "'":
+                yield self._consume_raw_string_literal()
+            elif self._current == '|':
+                yield self._match_or_else('|', 'or', 'pipe')
+            elif self._current == '`':
+                yield self._consume_literal()
+            elif self._current in self.START_NUMBER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_NUMBER:
+                    buff += self._current
+                yield {'type': 'number', 'value': int(buff),
+                       'start': start, 'end': start + len(buff)}
+            elif self._current == '"':
+                yield self._consume_quoted_identifier()
+            elif self._current == '<':
+                yield self._match_or_else('=', 'lte', 'lt')
+            elif self._current == '>':
+                yield self._match_or_else('=', 'gte', 'gt')
+            elif self._current == '!':
+                yield self._match_or_else('=', 'ne', 'unknown')
+            elif self._current == '=':
+                yield self._match_or_else('=', 'eq', 'unknown')
+            else:
+                raise LexerError(lexer_position=self._position,
+                                 lexer_value=self._current,
+                                 message="Unknown token %s" % self._current)
+        yield {'type': 'eof', 'value': '',
+               'start': self._length, 'end': self._length}
+
+    def _initialize_for_expression(self, expression):
         if not expression:
             raise EmptyExpressionError()
-        previous_column = 0
-        for match in self.master_regex.finditer(expression):
-            value = match.group()
-            start = match.start()
-            end = match.end()
-            if match.lastgroup == 'skip':
-                # Ignore whitespace.
-                previous_column = end
-                continue
-            if start != previous_column:
-                bad_value = expression[previous_column:start]
-                # Try to give a good error message.
-                if bad_value == '"':
-                    raise LexerError(
-                        lexer_position=previous_column,
-                        lexer_value=value,
-                        message='Starting quote is missing the ending quote',
-                        expression=expression)
-                raise LexerError(lexer_position=previous_column,
-                                 lexer_value=value,
-                                 message='Unknown character',
-                                 expression=expression)
-            previous_column = end
-            token_type = match.lastgroup
-            handler = getattr(self, '_token_%s' % token_type.lower(), None)
-            if handler is not None:
-                value = handler(value, start, end)
-            yield {'type': token_type, 'value': value,
-                   'start': start, 'end': end}
-        # At the end of the loop make sure we've consumed all the input.
-        # If we haven't then we have unidentified characters.
-        if end != len(expression):
-            msg = "Unknown characters at the end of the expression"
-            raise LexerError(lexer_position=end,
-                             lexer_value='',
-                             message=msg, expression=expression)
+        self._position = 0
+        self._expression = expression
+        self._chars = list(self._expression)
+        self._current = self._chars[self._position]
+        self._length = len(self._expression)
+
+    def _next(self):
+        if self._position == self._length - 1:
+            self._current = None
         else:
-            yield {'type': 'eof', 'value': '',
-                   'start': len(expression), 'end': len(expression)}
+            self._position += 1
+            self._current = self._chars[self._position]
+        return self._current

-    def _token_number(self, value, start, end):
-        return int(value)
+    def _consume_until(self, delimiter):
+        # Consume until the delimiter is reached,
+        # allowing for the delimiter to be escaped with "\".
+        start = self._position
+        buff = ''
+        self._next()
+        while self._current != delimiter:
+            if self._current == '\\':
+                buff += '\\'
+                self._next()
+            if self._current is None:
+                raise LexerError(lexer_position=start,
+                                 lexer_value=self._expression,
+                                 message="Unclosed %s delimiter" % delimiter)
+            buff += self._current
+            self._next()
+        # Skip the closing delimiter.
+        self._next()
+        return buff

-    def _token_quoted_identifier(self, value, start, end):
+    def _consume_literal(self):
+        start = self._position
+        lexeme = self._consume_until('`')
+        lexeme = lexeme.replace('\\`', '`')
         try:
-            return loads(value)
-        except ValueError as e:
-            error_message = str(e).split(':')[0]
-            raise LexerError(lexer_position=start,
-                             lexer_value=value,
-                             message=error_message)
-
-    def _token_string_literal(self, value, start, end):
-        return value[1:-1]
-
-    def _token_literal(self, value, start, end):
-        actual_value = value[1:-1]
-        actual_value = actual_value.replace('\\`', '`').lstrip()
-        # First, if it looks like JSON then we parse it as
-        # JSON and any json parsing errors propogate as lexing
-        # errors.
-        if self._looks_like_json(actual_value):
-            try:
-                return loads(actual_value)
-            except ValueError:
-                raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
-        else:
-            potential_value = '"%s"' % actual_value
+            # Assume it is valid JSON and attempt to parse.
+            parsed_json = loads(lexeme)
+        except ValueError:
             try:
-                # There's a shortcut syntax where string literals
-                # don't have to be quoted. This is only true if the
-                # string doesn't start with chars that could start a valid
-                # JSON value.
-                value = loads(potential_value)
+                # Invalid JSON values should be converted to quoted
+                # JSON strings during the JEP-12 deprecation period.
+                parsed_json = loads('"%s"' % lexeme.lstrip())
                 warnings.warn("deprecated string literal syntax",
                               PendingDeprecationWarning)
-                return value
             except ValueError:
                 raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
+                                 lexer_value=self._expression,
+                                 message="Bad token %s" % lexeme)
+        token_len = self._position - start
+        return {'type': 'literal', 'value': parsed_json,
+                'start': start, 'end': token_len}

-    def _looks_like_json(self, value):
-        # Figure out if the string "value" starts with something
-        # that looks like json.
-        if not value:
-            return False
-        elif value[0] in ['"', '{', '[']:
-            return True
-        elif value in ['true', 'false', 'null']:
-            return True
-        elif value[0] in ['-', '0', '1', '2', '3', '4', '5',
-                          '6', '7', '8', '9']:
-            # Then this is JSON, return True.
-            try:
-                loads(value)
-                return True
-            except ValueError:
-                return False
-        else:
-            return False
+    def _consume_quoted_identifier(self):
+        start = self._position
+        lexeme = '"' + self._consume_until('"') + '"'
+        try:
+            token_len = self._position - start
+            return {'type': 'quoted_identifier', 'value': loads(lexeme),
+                    'start': start, 'end': token_len}
+        except ValueError as e:
+            error_message = str(e).split(':')[0]
+            raise LexerError(lexer_position=start,
+                             lexer_value=lexeme,
+                             message=error_message)
+
+    def _consume_raw_string_literal(self):
+        start = self._position
+        lexeme = self._consume_until("'")
+        token_len = self._position - start
+        return {'type': 'literal', 'value': lexeme,
+                'start': start, 'end': token_len}
+
+    def _match_or_else(self, expected, match_type, else_type):
+        start = self._position
+        current = self._current
+        next_char = self._next()
+        if next_char == expected:
+            self._next()
+            return {'type': match_type, 'value': current + next_char,
+                    'start': start, 'end': start + 1}
+        return {'type': else_type, 'value': current,
+                'start': start, 'end': start}
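
For orientation, here is a short driver showing the token dicts the rewritten lexer yields; the expected output is derived from the tokenize() branches above (a sketch, not part of the commit):

```python
from jmespath.lexer import Lexer

for token in Lexer().tokenize('foo.bar'):
    print(token)

# Per the identifier and SIMPLE_TOKENS branches above, this prints
# (dict key order may vary on older Pythons):
# {'type': 'unquoted_identifier', 'value': 'foo', 'start': 0, 'end': 3}
# {'type': 'dot', 'value': '.', 'start': 3, 'end': 4}
# {'type': 'unquoted_identifier', 'value': 'bar', 'start': 4, 'end': 7}
# {'type': 'eof', 'value': '', 'start': 7, 'end': 7}
```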

tests/test_lexer.py

Lines changed: 11 additions & 0 deletions
@@ -132,6 +132,17 @@ def test_position_multiple_tokens(self):
             ]
         )

+    def test_adds_quotes_when_invalid_json(self):
+        tokens = list(self.lexer.tokenize('`{{}`'))
+        self.assertEqual(
+            tokens,
+            [{'type': 'literal', 'value': '{{}',
+              'start': 0, 'end': 4},
+             {'type': 'eof', 'value': '',
+              'start': 5, 'end': 5}
+            ]
+        )
+
     def test_unknown_character(self):
         with self.assertRaises(LexerError):
             tokens = list(self.lexer.tokenize('foo[0^]'))
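
The new test pins down the JEP-12 fallback in _consume_literal: a backtick literal that is not valid JSON is re-parsed as a quoted JSON string, and a PendingDeprecationWarning is emitted. A minimal sketch to observe both effects (PendingDeprecationWarning is ignored by default, hence the filter):

```python
import warnings
from jmespath.lexer import Lexer

# '`{{}`' is not valid JSON, so the lexer re-parses it as the
# JSON string "{{}" and warns about the deprecated syntax.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tokens = list(Lexer().tokenize('`{{}`'))

print(tokens[0]['value'])           # {{}
print(caught[0].category.__name__)  # PendingDeprecationWarning
```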

tests/test_parser.py

Lines changed: 1 addition & 7 deletions
@@ -144,18 +144,12 @@ def test_incomplete_expression_with_missing_paren(self):
     def test_bad_lexer_values(self):
         error_message = (
             'Bad jmespath expression: '
-            'Starting quote is missing the ending quote:\n'
+            'Unclosed " delimiter:\n'
             'foo."bar\n'
             '    ^')
         self.assert_error_message('foo."bar', error_message,
                                   exception=exceptions.LexerError)

-    def test_bad_lexer_literal_value_with_json_object(self):
-        error_message = ('Bad jmespath expression: '
-                         'Bad token `{{}`:\n`{{}`\n^')
-        self.assert_error_message('`{{}`', error_message,
-                                  exception=exceptions.LexerError)
-
     def test_bad_unicode_string(self):
         # This error message is straight from the JSON parser
         # and pypy has a slightly different error message,
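
The updated assertion tracks the lexer's new error wording: an unterminated quoted identifier now fails inside _consume_until() with an 'Unclosed " delimiter' message rather than the old 'Starting quote is missing the ending quote'. A minimal repro sketch:

```python
from jmespath import exceptions
from jmespath.lexer import Lexer

try:
    list(Lexer().tokenize('foo."bar'))
except exceptions.LexerError as e:
    # The generator raises once _consume_until() runs out of input
    # before finding the closing quote.
    print(e)  # message includes: Unclosed " delimiter
```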
