-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtest_lexer.py
More file actions
340 lines (263 loc) · 10.4 KB
/
test_lexer.py
File metadata and controls
340 lines (263 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import pytest
from qql.exceptions import QQLSyntaxError
from qql.lexer import Lexer, TokenKind
def tokenize(q):
    """Run a fresh Lexer over *q* and return the full token list."""
    lexer = Lexer()
    return lexer.tokenize(q)
def kinds(q):
    """Return only the TokenKind of each token produced for *q*."""
    return [token.kind for token in tokenize(q)]
class TestKeywords:
    """Recognition of the core statement-verb keywords."""

    def test_insert_keywords(self):
        expected = [
            TokenKind.INSERT,
            TokenKind.INTO,
            TokenKind.COLLECTION,
            TokenKind.IDENTIFIER,
            TokenKind.VALUES,
        ]
        assert kinds("INSERT INTO COLLECTION foo VALUES")[:5] == expected

    def test_keywords_case_insensitive(self):
        lowered = kinds("insert into collection foo values")
        assert lowered[0] == TokenKind.INSERT
        assert lowered[1] == TokenKind.INTO

    def test_show_collections(self):
        expected = [TokenKind.SHOW, TokenKind.COLLECTIONS]
        assert kinds("SHOW COLLECTIONS")[:2] == expected

    def test_search_keywords(self):
        result = kinds("SEARCH mycol SIMILAR TO 'hi' LIMIT 5")
        assert result[0] == TokenKind.SEARCH
        assert result[2] == TokenKind.SIMILAR
        assert result[3] == TokenKind.TO
        assert result[5] == TokenKind.LIMIT

    def test_scroll_keywords(self):
        result = kinds("SCROLL FROM docs AFTER 'cursor-id' LIMIT 50")
        assert result[0] == TokenKind.SCROLL
        assert result[1] == TokenKind.FROM
        assert TokenKind.AFTER in result
        assert TokenKind.LIMIT in result

    def test_select_keywords(self):
        result = kinds("SELECT * FROM notes WHERE id = 'abc'")
        assert result[0] == TokenKind.SELECT
        assert result[1] == TokenKind.STAR
        assert result[2] == TokenKind.FROM
        assert result[4] == TokenKind.WHERE

    def test_delete_keywords(self):
        expected = [TokenKind.DELETE, TokenKind.FROM, TokenKind.IDENTIFIER, TokenKind.WHERE]
        assert kinds("DELETE FROM foo WHERE id = 'abc'")[:4] == expected
class TestLiterals:
    """Literal tokens: strings in both quote styles, numbers, identifiers."""

    def test_double_quoted_string(self):
        first = tokenize('"hello world"')[0]
        assert first.kind == TokenKind.STRING
        assert first.value == "hello world"

    def test_single_quoted_string(self):
        first = tokenize("'hello'")[0]
        assert first.kind == TokenKind.STRING
        assert first.value == "hello"

    def test_integer(self):
        first = tokenize("42")[0]
        assert first.kind == TokenKind.INTEGER
        assert first.value == "42"

    def test_negative_integer(self):
        first = tokenize("-7")[0]
        assert first.kind == TokenKind.INTEGER
        assert first.value == "-7"

    def test_float(self):
        first = tokenize("3.14")[0]
        assert first.kind == TokenKind.FLOAT
        assert first.value == "3.14"

    def test_identifier(self):
        first = tokenize("my_collection_1")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "my_collection_1"
class TestPunctuation:
    """Structural punctuation: braces, brackets, colon, comma, star."""

    def test_braces_colons_commas(self):
        result = kinds("{ 'a' : 1 , 'b' : 2 }")
        for expected in (TokenKind.LBRACE, TokenKind.RBRACE, TokenKind.COLON, TokenKind.COMMA):
            assert expected in result

    def test_brackets(self):
        result = kinds("[ 1, 2 ]")
        assert result[0] == TokenKind.LBRACKET
        # Last token is EOF, so the closing bracket sits one before it.
        assert result[-2] == TokenKind.RBRACKET

    def test_star(self):
        assert kinds("*")[0] == TokenKind.STAR
class TestErrors:
    """Lexer error reporting: message content and position info."""

    def test_unterminated_string(self):
        with pytest.raises(QQLSyntaxError, match="Unterminated"):
            tokenize('"not closed')

    def test_unexpected_character(self):
        with pytest.raises(QQLSyntaxError, match="Unexpected character"):
            tokenize("@bad")

    def test_error_includes_position(self):
        with pytest.raises(QQLSyntaxError) as exc_info:
            tokenize("abc @")
        # The exception must carry where in the input the failure occurred.
        assert exc_info.value.pos is not None
class TestNewOperators:
    """Comparison operators, parentheses, filter keywords, dotted paths."""

    @staticmethod
    def _second_token(query):
        # Every operator fixture here places the operator as token index 1.
        return tokenize(query)[1]

    def test_not_equals(self):
        op = self._second_token("field != 'x'")
        assert op.kind == TokenKind.NOT_EQUALS
        assert op.value == "!="

    def test_gt(self):
        op = self._second_token("score > 0.5")
        assert op.kind == TokenKind.GT
        assert op.value == ">"

    def test_gte(self):
        op = self._second_token("score >= 0.5")
        assert op.kind == TokenKind.GTE
        assert op.value == ">="

    def test_lt(self):
        op = self._second_token("year < 2024")
        assert op.kind == TokenKind.LT
        assert op.value == "<"

    def test_lte(self):
        op = self._second_token("year <= 2023")
        assert op.kind == TokenKind.LTE
        assert op.value == "<="

    def test_lparen_rparen(self):
        result = kinds("(a OR b)")
        assert TokenKind.LPAREN in result
        assert TokenKind.RPAREN in result

    def test_filter_keywords(self):
        result = kinds("AND OR NOT IN BETWEEN IS NULL EMPTY MATCH ANY PHRASE")
        expected = (
            TokenKind.AND, TokenKind.OR, TokenKind.NOT, TokenKind.IN,
            TokenKind.BETWEEN, TokenKind.IS, TokenKind.NULL, TokenKind.EMPTY,
            TokenKind.MATCH, TokenKind.ANY, TokenKind.PHRASE,
        )
        for keyword in expected:
            assert keyword in result

    def test_filter_keywords_case_insensitive(self):
        result = kinds("and or not in between is null empty match any phrase")
        assert TokenKind.AND in result
        assert TokenKind.OR in result
        assert TokenKind.NOT in result

    def test_dotted_identifier(self):
        first = tokenize("meta.source")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "meta.source"

    def test_three_level_dotted_identifier(self):
        first = tokenize("a.b.c")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "a.b.c"

    def test_nested_array_path(self):
        first = tokenize("country.cities[].population")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "country.cities[].population"

    def test_gt_does_not_consume_equals_sign(self):
        # ">" followed by something other than "=" must stay a bare GT.
        assert self._second_token("a > b").kind == TokenKind.GT

    def test_bare_exclamation_raises(self):
        # "!" is only valid as part of "!="; alone it is a syntax error.
        with pytest.raises(QQLSyntaxError):
            tokenize("field ! 'x'")
class TestEOF:
    """The token stream is always terminated by an EOF sentinel."""

    def test_ends_with_eof(self):
        stream = tokenize("hello")
        assert stream[-1].kind == TokenKind.EOF
class TestHybridKeyword:
    """HYBRID/DENSE/SPARSE/FUSION keywords, including case handling and
    the rule that a keyword followed by a dot lexes as an identifier."""

    def test_hybrid_keyword_uppercase(self):
        assert kinds("HYBRID")[0] == TokenKind.HYBRID

    def test_hybrid_keyword_lowercase(self):
        assert kinds("hybrid")[0] == TokenKind.HYBRID

    def test_dense_keyword(self):
        assert kinds("DENSE")[0] == TokenKind.DENSE

    def test_dense_keyword_lowercase(self):
        assert kinds("dense")[0] == TokenKind.DENSE

    def test_sparse_keyword(self):
        assert kinds("SPARSE")[0] == TokenKind.SPARSE

    def test_sparse_keyword_lowercase(self):
        assert kinds("sparse")[0] == TokenKind.SPARSE

    def test_fusion_keyword(self):
        assert kinds("FUSION")[0] == TokenKind.FUSION

    def test_hybrid_in_create_statement(self):
        result = kinds("CREATE COLLECTION articles HYBRID")
        assert result[3] == TokenKind.HYBRID

    def test_hybrid_in_search_statement(self):
        result = kinds("SEARCH col SIMILAR TO 'q' LIMIT 5 USING HYBRID")
        assert TokenKind.HYBRID in result

    def test_dense_as_identifier_in_dotted_path(self):
        first = tokenize("dense.field")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "dense.field"

    def test_sparse_as_identifier_in_dotted_path(self):
        first = tokenize("sparse.value")[0]
        assert first.kind == TokenKind.IDENTIFIER
        assert first.value == "sparse.value"
class TestRerankKeyword:
    """RERANK keyword in isolation and inside SEARCH statements."""

    def test_rerank_keyword_uppercase(self):
        assert kinds("RERANK")[0] == TokenKind.RERANK

    def test_rerank_keyword_lowercase(self):
        assert kinds("rerank")[0] == TokenKind.RERANK

    def test_rerank_keyword_mixed_case(self):
        assert kinds("Rerank")[0] == TokenKind.RERANK

    def test_rerank_in_search_statement(self):
        result = kinds("SEARCH col SIMILAR TO 'q' LIMIT 5 RERANK")
        assert TokenKind.RERANK in result

    def test_rerank_with_model_in_search(self):
        result = kinds("SEARCH col SIMILAR TO 'q' LIMIT 5 RERANK MODEL 'x'")
        at = result.index(TokenKind.RERANK)
        # RERANK is immediately followed by MODEL and then its string argument.
        assert result[at + 1] == TokenKind.MODEL
        assert result[at + 2] == TokenKind.STRING
class TestSearchParamKeywords:
    """Search-parameter keywords: EXACT, WITH, ACORN."""

    def test_exact_keyword(self):
        assert kinds("EXACT")[0] == TokenKind.EXACT

    def test_with_keyword(self):
        assert kinds("WITH")[0] == TokenKind.WITH

    def test_acorn_keyword(self):
        assert kinds("ACORN")[0] == TokenKind.ACORN
class TestUpdateGroupByKeywords:
    """GROUP BY / GROUP_SIZE and UPDATE SET keywords, alone and in sequence."""

    def test_group_token(self):
        assert kinds("GROUP")[0] == TokenKind.GROUP

    def test_by_token(self):
        assert kinds("BY")[0] == TokenKind.BY

    def test_group_size_token(self):
        assert kinds("GROUP_SIZE")[0] == TokenKind.GROUP_SIZE

    def test_update_token(self):
        assert kinds("UPDATE")[0] == TokenKind.UPDATE

    def test_set_token(self):
        assert kinds("SET")[0] == TokenKind.SET

    def test_payload_token(self):
        assert kinds("PAYLOAD")[0] == TokenKind.PAYLOAD

    def test_group_by_sequence(self):
        expected = [TokenKind.GROUP, TokenKind.BY, TokenKind.IDENTIFIER]
        assert kinds("GROUP BY category")[:3] == expected

    def test_group_size_followed_by_integer(self):
        expected = [TokenKind.GROUP_SIZE, TokenKind.INTEGER]
        assert kinds("GROUP_SIZE 5")[:2] == expected

    def test_update_set_payload_sequence(self):
        expected = [TokenKind.UPDATE, TokenKind.SET, TokenKind.PAYLOAD]
        assert kinds("UPDATE SET PAYLOAD")[:3] == expected

    def test_update_set_vector_sequence(self):
        expected = [TokenKind.UPDATE, TokenKind.SET, TokenKind.VECTOR]
        assert kinds("UPDATE SET VECTOR")[:3] == expected