-import re
+import string
 import warnings
 from json import loads
 
 from jmespath.exceptions import LexerError, EmptyExpressionError
 
 
 class Lexer(object):
-    TOKENS = (
-        r'(?P<number>-?\d+)|'
-        r'(?P<unquoted_identifier>([a-zA-Z_][a-zA-Z_0-9]*))|'
-        r'(?P<quoted_identifier>("(?:\\\\|\\"|[^"])*"))|'
-        r'(?P<string_literal>(\'(?:\\\\|\\\'|[^\'])*\'))|'
-        r'(?P<literal>(`(?:\\\\|\\`|[^`])*`))|'
-        r'(?P<filter>\[\?)|'
-        r'(?P<or>\|\|)|'
-        r'(?P<pipe>\|)|'
-        r'(?P<ne>!=)|'
-        r'(?P<rbrace>\})|'
-        r'(?P<eq>==)|'
-        r'(?P<dot>\.)|'
-        r'(?P<star>\*)|'
-        r'(?P<gte>>=)|'
-        r'(?P<lparen>\()|'
-        r'(?P<lbrace>\{)|'
-        r'(?P<lte><=)|'
-        r'(?P<flatten>\[\])|'
-        r'(?P<rbracket>\])|'
-        r'(?P<lbracket>\[)|'
-        r'(?P<rparen>\))|'
-        r'(?P<comma>,)|'
-        r'(?P<colon>:)|'
-        r'(?P<lt><)|'
-        r'(?P<expref>&)|'
-        r'(?P<gt>>)|'
-        r'(?P<current>@)|'
-        r'(?P<skip>[ \t]+)'
-    )
-
-    def __init__(self):
-        self.master_regex = re.compile(self.TOKENS)
+    START_IDENTIFIER = set(string.ascii_letters + '_')
+    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
+    START_NUMBER = set(string.digits + '-')
+    VALID_NUMBER = set(string.digits)
+    WHITESPACE = set(" \t\n\r")
+    SIMPLE_TOKENS = {
+        '.': 'dot',
+        '*': 'star',
+        ']': 'rbracket',
+        ',': 'comma',
+        ':': 'colon',
+        '@': 'current',
+        '&': 'expref',
+        '(': 'lparen',
+        ')': 'rparen',
+        '{': 'lbrace',
+        '}': 'rbrace'
+    }
 
     def tokenize(self, expression):
+        self._initialize_for_expression(expression)
+        while self._current is not None:
+            if self._current in self.SIMPLE_TOKENS:
+                yield {'type': self.SIMPLE_TOKENS[self._current],
+                       'value': self._current,
+                       'start': self._position, 'end': self._position + 1}
+                self._next()
+            elif self._current in self.START_IDENTIFIER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_IDENTIFIER:
+                    buff += self._current
+                yield {'type': 'unquoted_identifier', 'value': buff,
+                       'start': start, 'end': start + len(buff)}
+            elif self._current in self.WHITESPACE:
+                self._next()
+            elif self._current == '[':
+                start = self._position
+                next_char = self._next()
+                if next_char == ']':
+                    self._next()
+                    yield {'type': 'flatten', 'value': '[]',
+                           'start': start, 'end': start + 2}
+                elif next_char == '?':
+                    self._next()
+                    yield {'type': 'filter', 'value': '[?',
+                           'start': start, 'end': start + 2}
+                else:
+                    yield {'type': 'lbracket', 'value': '[',
+                           'start': start, 'end': start + 1}
+            elif self._current == "'":
+                yield self._consume_raw_string_literal()
+            elif self._current == '|':
+                yield self._match_or_else('|', 'or', 'pipe')
+            elif self._current == '`':
+                yield self._consume_literal()
+            elif self._current in self.START_NUMBER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_NUMBER:
+                    buff += self._current
+                yield {'type': 'number', 'value': int(buff),
+                       'start': start, 'end': start + len(buff)}
+            elif self._current == '"':
+                yield self._consume_quoted_identifier()
+            elif self._current == '<':
+                yield self._match_or_else('=', 'lte', 'lt')
+            elif self._current == '>':
+                yield self._match_or_else('=', 'gte', 'gt')
+            elif self._current == '!':
+                yield self._match_or_else('=', 'ne', 'unknown')
+            elif self._current == '=':
+                yield self._match_or_else('=', 'eq', 'unknown')
+            else:
+                raise LexerError(lexer_position=self._position,
+                                 lexer_value=self._current,
+                                 message="Unknown token %s" % self._current)
+        yield {'type': 'eof', 'value': '',
+               'start': self._length, 'end': self._length}
+
+    def _initialize_for_expression(self, expression):
         if not expression:
             raise EmptyExpressionError()
-        previous_column = 0
-        for match in self.master_regex.finditer(expression):
-            value = match.group()
-            start = match.start()
-            end = match.end()
-            if match.lastgroup == 'skip':
-                # Ignore whitespace.
-                previous_column = end
-                continue
-            if start != previous_column:
-                bad_value = expression[previous_column:start]
-                # Try to give a good error message.
-                if bad_value == '"':
-                    raise LexerError(
-                        lexer_position=previous_column,
-                        lexer_value=value,
-                        message='Starting quote is missing the ending quote',
-                        expression=expression)
-                raise LexerError(lexer_position=previous_column,
-                                 lexer_value=value,
-                                 message='Unknown character',
-                                 expression=expression)
-            previous_column = end
-            token_type = match.lastgroup
-            handler = getattr(self, '_token_%s' % token_type.lower(), None)
-            if handler is not None:
-                value = handler(value, start, end)
-            yield {'type': token_type, 'value': value,
-                   'start': start, 'end': end}
-        # At the end of the loop make sure we've consumed all the input.
-        # If we haven't then we have unidentified characters.
-        if end != len(expression):
-            msg = "Unknown characters at the end of the expression"
-            raise LexerError(lexer_position=end,
-                             lexer_value='',
-                             message=msg, expression=expression)
+        self._position = 0
+        self._expression = expression
+        self._chars = list(self._expression)
+        self._current = self._chars[self._position]
+        self._length = len(self._expression)
+
+    def _next(self):
+        if self._position == self._length - 1:
+            self._current = None
         else:
-            yield {'type': 'eof', 'value': '',
-                   'start': len(expression), 'end': len(expression)}
+            self._position += 1
+            self._current = self._chars[self._position]
+        return self._current
 
-    def _token_number(self, value, start, end):
-        return int(value)
+    def _consume_until(self, delimiter):
+        # Consume until the delimiter is reached,
+        # allowing for the delimiter to be escaped with "\".
+        start = self._position
+        buff = ''
+        self._next()
+        while self._current != delimiter:
+            if self._current == '\\':
+                buff += '\\'
+                self._next()
+            if self._current is None:
+                raise LexerError(lexer_position=start,
+                                 lexer_value=self._expression,
+                                 message="Unclosed %s delimiter" % delimiter)
+            buff += self._current
+            self._next()
+        # Skip the closing delimiter.
+        self._next()
+        return buff
 
-    def _token_quoted_identifier(self, value, start, end):
+    def _consume_literal(self):
+        start = self._position
+        lexeme = self._consume_until('`')
+        lexeme = lexeme.replace('\\`', '`')
         try:
-            return loads(value)
-        except ValueError as e:
-            error_message = str(e).split(':')[0]
-            raise LexerError(lexer_position=start,
-                             lexer_value=value,
-                             message=error_message)
-
-    def _token_string_literal(self, value, start, end):
-        return value[1:-1]
-
-    def _token_literal(self, value, start, end):
-        actual_value = value[1:-1]
-        actual_value = actual_value.replace('\\`', '`').lstrip()
-        # First, if it looks like JSON then we parse it as
-        # JSON and any json parsing errors propogate as lexing
-        # errors.
-        if self._looks_like_json(actual_value):
-            try:
-                return loads(actual_value)
-            except ValueError:
-                raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
-        else:
-            potential_value = '"%s"' % actual_value
+            # Assume it is valid JSON and attempt to parse.
+            parsed_json = loads(lexeme)
+        except ValueError:
             try:
-                # There's a shortcut syntax where string literals
-                # don't have to be quoted. This is only true if the
-                # string doesn't start with chars that could start a valid
-                # JSON value.
-                value = loads(potential_value)
+                # Invalid JSON values should be converted to quoted
+                # JSON strings during the JEP-12 deprecation period.
+                parsed_json = loads('"%s"' % lexeme.lstrip())
                 warnings.warn("deprecated string literal syntax",
                               PendingDeprecationWarning)
-                return value
             except ValueError:
                 raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
+                                 lexer_value=self._expression,
+                                 message="Bad token %s" % lexeme)
+        token_len = self._position - start
+        return {'type': 'literal', 'value': parsed_json,
+                'start': start, 'end': token_len}
 
-    def _looks_like_json(self, value):
-        # Figure out if the string "value" starts with something
-        # that looks like json.
-        if not value:
-            return False
-        elif value[0] in ['"', '{', '[']:
-            return True
-        elif value in ['true', 'false', 'null']:
-            return True
-        elif value[0] in ['-', '0', '1', '2', '3', '4', '5',
-                          '6', '7', '8', '9']:
-            # Then this is JSON, return True.
-            try:
-                loads(value)
-                return True
-            except ValueError:
-                return False
-        else:
-            return False
+    def _consume_quoted_identifier(self):
+        start = self._position
+        lexeme = '"' + self._consume_until('"') + '"'
+        try:
+            token_len = self._position - start
+            return {'type': 'quoted_identifier', 'value': loads(lexeme),
+                    'start': start, 'end': token_len}
+        except ValueError as e:
+            error_message = str(e).split(':')[0]
+            raise LexerError(lexer_position=start,
+                             lexer_value=lexeme,
+                             message=error_message)
+
+    def _consume_raw_string_literal(self):
+        start = self._position
+        lexeme = self._consume_until("'")
+        token_len = self._position - start
+        return {'type': 'literal', 'value': lexeme,
+                'start': start, 'end': token_len}
+
+    def _match_or_else(self, expected, match_type, else_type):
+        start = self._position
+        current = self._current
+        next_char = self._next()
+        if next_char == expected:
+            self._next()
+            return {'type': match_type, 'value': current + next_char,
+                    'start': start, 'end': start + 1}
+        return {'type': else_type, 'value': current,
+                'start': start, 'end': start}
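
For a quick sanity check of the rewritten lexer, a minimal usage sketch (assuming the Lexer class above is importable as jmespath.lexer.Lexer; adjust the import to match your checkout):

    # Drive the character-based scanner over a sample expression and
    # print each token the generator yields.
    from jmespath.lexer import Lexer

    for token in Lexer().tokenize("foo[?bar == `1`] | baz"):
        print(token['type'], repr(token['value']))

    # Expected token types, in order: unquoted_identifier, filter,
    # unquoted_identifier, eq, literal, rbracket, pipe,
    # unquoted_identifier, eof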
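The nested try/except in _consume_literal is the JEP-12 deprecation path: a backtick literal that fails to parse as JSON is retried as a quoted JSON string, and a PendingDeprecationWarning is emitted. A small sketch of that behavior, under the same import assumption:

    # `foo` is not valid JSON, so the lexer re-parses it as '"foo"' and
    # warns; a properly quoted literal like `"foo"` parses silently.
    import warnings
    from jmespath.lexer import Lexer

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        token = next(Lexer().tokenize('`foo`'))

    print(token['value'])   # 'foo', via the '"%s"' % lexeme fallback
    print(len(caught))      # 1 -- the deprecation warning was recorded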