
Commit 1f0ad9f

Lexer cleanup and test fixes
1 parent 4c87771 commit 1f0ad9f

File tree

3 files changed: 67 additions & 59 deletions


jmespath/lexer.py

Lines changed: 55 additions & 52 deletions
@@ -5,26 +5,6 @@
 from jmespath.exceptions import LexerError, EmptyExpressionError


-START_IDENTIFIER = set(string.ascii_letters + '_')
-VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
-START_NUMBER = set(string.digits + '-')
-VALID_NUMBER = set(string.digits)
-WHITESPACE = set(" \t\n\r")
-SIMPLE_TOKENS = {
-    '.': 'dot',
-    '*': 'star',
-    ']': 'rbracket',
-    ',': 'comma',
-    ':': 'colon',
-    '@': 'current',
-    '&': 'expref',
-    '(': 'lparen',
-    ')': 'rparen',
-    '{': 'lbrace',
-    '}': 'rbrace'
-}
-
-
 class Scanner(object):
     def __init__(self, expression):
         if not expression:
@@ -45,68 +25,87 @@ def next(self):

     def in_delimiter(self, delimiter):
         start = self.pos
-        buffer = ''
+        buff = ''
         self.next()
         while self.current != delimiter:
             if self.current == '\\':
-                buffer += '\\'
+                buff += '\\'
                 self.next()
             if self.current is None:
-                print(buffer)
                 raise LexerError(lexer_position=start,
                                  lexer_value=self.expression,
-                                 message="Unclosed delimiter: %s" % buffer)
-            buffer += self.current
+                                 message="Unclosed %s delimiter" % delimiter)
+            buff += self.current
             self.next()
+        # Skip the closing delimiter.
         self.next()
-        return buffer
+        return buff


 class Lexer(object):
+    START_IDENTIFIER = set(string.ascii_letters + '_')
+    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
+    START_NUMBER = set(string.digits + '-')
+    VALID_NUMBER = set(string.digits)
+    WHITESPACE = set(" \t\n\r")
+    SIMPLE_TOKENS = {
+        '.': 'dot',
+        '*': 'star',
+        ']': 'rbracket',
+        ',': 'comma',
+        ':': 'colon',
+        '@': 'current',
+        '&': 'expref',
+        '(': 'lparen',
+        ')': 'rparen',
+        '{': 'lbrace',
+        '}': 'rbrace'
+    }
+
     def tokenize(self, expression):
         scanner = Scanner(expression)
         while scanner.current is not None:
-            if scanner.current in SIMPLE_TOKENS:
-                yield {'type': SIMPLE_TOKENS[scanner.current],
+            if scanner.current in self.SIMPLE_TOKENS:
+                yield {'type': self.SIMPLE_TOKENS[scanner.current],
                        'value': scanner.current,
-                       'start': scanner.pos, 'end': scanner.pos}
+                       'start': scanner.pos, 'end': scanner.pos + 1}
                 scanner.next()
-            elif scanner.current in START_IDENTIFIER:
+            elif scanner.current in self.START_IDENTIFIER:
                 start = scanner.pos
-                buffer = scanner.current
-                while scanner.next() in VALID_IDENTIFIER:
-                    buffer += scanner.current
-                yield {'type': 'unquoted_identifier', 'value': buffer,
-                       'start': start, 'end': len(buffer)}
-            elif scanner.current in WHITESPACE:
+                buff = scanner.current
+                while scanner.next() in self.VALID_IDENTIFIER:
+                    buff += scanner.current
+                yield {'type': 'unquoted_identifier', 'value': buff,
+                       'start': start, 'end': start + len(buff)}
+            elif scanner.current in self.WHITESPACE:
                 scanner.next()
             elif scanner.current == '[':
                 start = scanner.pos
                 next_char = scanner.next()
                 if next_char == ']':
                     scanner.next()
                     yield {'type': 'flatten', 'value': '[]',
-                           'start': start, 'end': start + 1}
+                           'start': start, 'end': start + 2}
                 elif next_char == '?':
                     scanner.next()
                     yield {'type': 'filter', 'value': '[?',
-                           'start': start, 'end': start + 1}
+                           'start': start, 'end': start + 2}
                 else:
                     yield {'type': 'lbracket', 'value': '[',
-                           'start': start, 'end': start}
+                           'start': start, 'end': start + 1}
             elif scanner.current == "'":
                 yield self._consume_raw_string_literal(scanner)
             elif scanner.current == '|':
                 yield self._match_or_else(scanner, '|', 'or', 'pipe')
             elif scanner.current == '`':
                 yield self._consume_literal(scanner)
-            elif scanner.current in START_NUMBER:
+            elif scanner.current in self.START_NUMBER:
                 start = scanner.pos
-                buffer = scanner.current
-                while scanner.next() in VALID_NUMBER:
-                    buffer += scanner.current
-                yield {'type': 'number', 'value': int(buffer),
-                       'start': start, 'end': len(buffer)}
+                buff = scanner.current
+                while scanner.next() in self.VALID_NUMBER:
+                    buff += scanner.current
+                yield {'type': 'number', 'value': int(buff),
+                       'start': start, 'end': start + len(buff)}
             elif scanner.current == '"':
                 yield self._consume_quoted_identifier(scanner)
             elif scanner.current == '<':
@@ -118,15 +117,16 @@ def tokenize(self, expression):
             elif scanner.current == '=':
                 yield self._match_or_else(scanner, '=', 'eq', 'unknown')
             else:
-                yield {'type': 'unknown', 'value': scanner.current,
-                       'start': scanner.pos, 'end': scanner.pos}
-                scanner.next()
+                raise LexerError(lexer_position=scanner.pos,
+                                 lexer_value=scanner.current,
+                                 message="Unknown token %s" % scanner.current)
         yield {'type': 'eof', 'value': '',
               'start': len(expression), 'end': len(expression)}

     def _consume_literal(self, scanner):
         start = scanner.pos
         lexeme = scanner.in_delimiter('`')
+        lexeme = lexeme.replace('\\`', '`')
         try:
             # Assume it is valid JSON and attempt to parse.
             parsed_json = loads(lexeme)
@@ -141,15 +141,17 @@ def _consume_literal(self, scanner):
                 raise LexerError(lexer_position=start,
                                  lexer_value=lexeme,
                                  message="Bad token %s" % lexeme)
+        token_len = scanner.pos - start
         return {'type': 'literal', 'value': parsed_json,
-                'start': start, 'end': len(lexeme)}
+                'start': start, 'end': token_len}

     def _consume_quoted_identifier(self, scanner):
         start = scanner.pos
-        lexeme = scanner.in_delimiter('"')
+        lexeme = '"' + scanner.in_delimiter('"') + '"'
         try:
+            token_len = scanner.pos - start
             return {'type': 'quoted_identifier', 'value': loads(lexeme),
-                    'start': start, 'end': len(lexeme)}
+                    'start': start, 'end': token_len}
         except ValueError as e:
             error_message = str(e).split(':')[0]
             raise LexerError(lexer_position=start,
@@ -159,8 +161,9 @@ def _consume_quoted_identifier(self, scanner):
     def _consume_raw_string_literal(self, scanner):
         start = scanner.pos
         lexeme = scanner.in_delimiter("'")
+        token_len = scanner.pos - start
         return {'type': 'literal', 'value': lexeme,
-                'start': start, 'end': len(lexeme)}
+                'start': start, 'end': token_len}

     def _match_or_else(self, scanner, expected, match_type, else_type):
         start = scanner.pos
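
The net effect on token positions is easiest to see on a small expression. A minimal sketch, assuming the module is importable as jmespath.lexer per the file tree: 'end' is now exclusive (start plus token length) rather than reusing 'start' or len(value), and an unrecognized character now raises LexerError instead of yielding an 'unknown' token.

    from jmespath.lexer import Lexer

    for token in Lexer().tokenize('foo[0]'):
        print(token)
    # {'type': 'unquoted_identifier', 'value': 'foo', 'start': 0, 'end': 3}
    # {'type': 'lbracket', 'value': '[', 'start': 3, 'end': 4}
    # {'type': 'number', 'value': 0, 'start': 4, 'end': 5}
    # {'type': 'rbracket', 'value': ']', 'start': 5, 'end': 6}
    # {'type': 'eof', 'value': '', 'start': 6, 'end': 6}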

tests/test_lexer.py

Lines changed: 11 additions & 0 deletions
@@ -132,6 +132,17 @@ def test_position_multiple_tokens(self):
             ]
         )

+    def test_adds_quotes_when_invalid_json(self):
+        tokens = list(self.lexer.tokenize('`{{}`'))
+        self.assertEqual(
+            tokens,
+            [{'type': 'literal', 'value': '{{}',
+              'start': 0, 'end': 4},
+             {'type': 'eof', 'value': '',
+              'start': 5, 'end': 5}
+            ]
+        )
+
     def test_unknown_character(self):
         with self.assertRaises(LexerError):
             tokens = list(self.lexer.tokenize('foo[0^]'))
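
The new test pins down the fallback path for backtick literals that are not valid JSON: the raw text is kept as a string value rather than rejected, which is why the parser-level "Bad token" test is deleted below. A short sketch, assuming the same jmespath.lexer import path as above:

    from jmespath.lexer import Lexer

    # Invalid JSON between backticks is re-parsed as a quoted string,
    # so the literal's value is the raw text '{{}'.
    token = next(Lexer().tokenize('`{{}`'))
    assert token == {'type': 'literal', 'value': '{{}', 'start': 0, 'end': 4}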

tests/test_parser.py

Lines changed: 1 addition & 7 deletions
@@ -144,18 +144,12 @@ def test_incomplete_expression_with_missing_paren(self):
     def test_bad_lexer_values(self):
         error_message = (
             'Bad jmespath expression: '
-            'Starting quote is missing the ending quote:\n'
+            'Unclosed " delimiter:\n'
             'foo."bar\n'
             '    ^')
         self.assert_error_message('foo."bar', error_message,
                                   exception=exceptions.LexerError)

-    def test_bad_lexer_literal_value_with_json_object(self):
-        error_message = ('Bad jmespath expression: '
-                         'Bad token `{{}`:\n`{{}`\n^')
-        self.assert_error_message('`{{}`', error_message,
-                                  exception=exceptions.LexerError)
-
     def test_bad_unicode_string(self):
         # This error message is straight from the JSON parser
         # and pypy has a slightly different error message,
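
For illustration, this is how the reworded delimiter error surfaces to callers; a sketch assuming LexerError formats its message with the expression and a caret at lexer_position, which is what the updated test expects:

    from jmespath import exceptions
    from jmespath.lexer import Lexer

    try:
        # Unterminated quoted identifier: raised while the generator runs.
        list(Lexer().tokenize('foo."bar'))
    except exceptions.LexerError as e:
        print(e)
    # Bad jmespath expression: Unclosed " delimiter:
    # foo."bar
    #     ^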
