-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtest_dumper.py
More file actions
265 lines (209 loc) · 10.8 KB
/
test_dumper.py
File metadata and controls
265 lines (209 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""Tests for the QQL collection dumper (src/qql/dumper.py)."""
from __future__ import annotations
import pytest
from rich.console import Console
from qql.dumper import (
_DEFAULT_DUMP_BATCH_SIZE,
_is_hybrid,
_serialize_value,
dump_collection,
)
# ── Helpers ───────────────────────────────────────────────────────────────────
def null_console() -> Console:
    """Return a Rich console that swallows all output (for silent test runs)."""
    silenced = Console(quiet=True)
    return silenced
def _make_record(mocker, payload: dict, point_id="rec-1"):
    """Build a mock standing in for a Qdrant ScoredPoint / Record.

    Only the two attributes the dumper reads — ``id`` and ``payload`` — are
    populated.
    """
    record = mocker.MagicMock()
    record.id = point_id
    record.payload = payload
    return record
def _make_client(mocker, *, exists=True, hybrid=False, points=None, total=None):
    """Build a mock QdrantClient for dump tests.

    *points* is a list of payload dicts. scroll() always returns all of them
    in one batch with a ``None`` next-offset; tests that need multiple pages
    override ``client.scroll.side_effect`` themselves.

    *total* overrides the reported point count; by default it is len(points).
    """
    points = points or []
    client = mocker.MagicMock()
    client.collection_exists.return_value = exists
    # get_collection — return hybrid or dense vector config
    if hybrid:
        # dict-valued vectors config → _is_hybrid() treats collection as hybrid
        client.get_collection.return_value.config.params.vectors = {"dense": object()}
    else:
        # non-dict → dense-only
        client.get_collection.return_value.config.params.vectors = mocker.MagicMock(
            spec=[]  # not a dict
        )
    # count — the dumper presumably uses this for progress/totals; TODO confirm
    cnt = mocker.MagicMock()
    cnt.count = total if total is not None else len(points)
    client.count.return_value = cnt
    # scroll — single-batch by default; ids are "id-1", "id-2", … (1-based)
    records = [_make_record(mocker, p, f"id-{i}") for i, p in enumerate(points, 1)]
    client.scroll.return_value = (records, None)
    return client
# ── _serialize_value ──────────────────────────────────────────────────────────
class TestSerializeValue:
    """Spot-checks for _serialize_value() across the supported value types."""

    def test_string(self):
        serialized = _serialize_value("hello world")
        assert serialized == "'hello world'"

    def test_string_escapes_single_quote(self):
        serialized = _serialize_value("it's")
        assert serialized == "'it\\'s'"

    def test_string_escapes_backslash(self):
        serialized = _serialize_value("a\\b")
        assert serialized == "'a\\\\b'"

    def test_int(self):
        assert "42" == _serialize_value(42)

    def test_negative_int(self):
        assert "-7" == _serialize_value(-7)

    def test_float(self):
        serialized = _serialize_value(3.14)
        assert "3.14" in serialized

    def test_bool_true(self):
        # booleans render lowercase (JSON-style), not Python-style
        assert "true" == _serialize_value(True)

    def test_bool_false(self):
        assert "false" == _serialize_value(False)

    def test_none(self):
        assert "null" == _serialize_value(None)

    def test_list(self):
        assert "[1, 2, 3]" == _serialize_value([1, 2, 3])

    def test_nested_list_of_strings(self):
        serialized = _serialize_value(["a", "b"])
        assert serialized == "['a', 'b']"

    def test_dict_produces_braces(self):
        serialized = _serialize_value({"key": "val"})
        assert "{" in serialized
        assert "}" in serialized
        assert "'key'" in serialized
        assert "'val'" in serialized
# ── _is_hybrid ────────────────────────────────────────────────────────────────
class TestIsHybrid:
    """_is_hybrid() keys off whether the collection's vectors config is a dict."""

    def test_dict_vectors_is_hybrid(self, mocker):
        client = mocker.MagicMock()
        vectors_config = {"dense": object()}
        client.get_collection.return_value.config.params.vectors = vectors_config
        assert _is_hybrid("col", client) is True

    def test_scalar_vectors_is_not_hybrid(self, mocker):
        client = mocker.MagicMock()
        # spec=[] makes the mock reject dict-like behavior entirely
        vectors_config = mocker.MagicMock(spec=[])
        client.get_collection.return_value.config.params.vectors = vectors_config
        assert _is_hybrid("col", client) is False
# ── dump_collection ───────────────────────────────────────────────────────────
class TestDumpCollection:
    """End-to-end tests for dump_collection() against a fully mocked client.

    dump_collection is called as (name, out_path, client, console, console)
    and returns a (written, skipped) pair. The two consoles are presumably
    stdout/stderr channels — signature not visible here; TODO confirm.
    """

    def test_creates_output_file(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "hello"}])
        dump_collection("col", out, client, null_console(), null_console())
        assert (tmp_path / "dump.qql").exists()

    def test_writes_create_statement_dense(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "hello"}])
        dump_collection("my_col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        assert "CREATE COLLECTION my_col\n" in content
        # the CREATE line itself must not carry the HYBRID modifier
        assert "HYBRID" not in content.split("CREATE")[1].split("\n")[0]

    def test_writes_create_statement_hybrid(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, hybrid=True, points=[{"text": "hello"}])
        dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        assert "CREATE COLLECTION col HYBRID" in content

    def test_hybrid_insert_bulk_has_using_hybrid(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, hybrid=True, points=[{"text": "hello"}])
        dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        # "] USING HYBRID" anchors the clause to the end of the values list
        assert "] USING HYBRID" in content

    def test_dense_insert_bulk_has_no_using_clause(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "hello"}])
        dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        assert "USING HYBRID" not in content

    def test_skips_points_without_text_field(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        # middle point lacks "text" and must be counted as skipped, not written
        points = [{"text": "ok"}, {"author": "no_text_here"}, {"text": "also ok"}]
        client = _make_client(mocker, points=points)
        written, skipped = dump_collection("col", out, client, null_console(), null_console())
        assert written == 2
        assert skipped == 1
        content = (tmp_path / "dump.qql").read_text()
        assert "no_text_here" not in content

    def test_returns_zero_when_collection_missing(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, exists=False)
        written, skipped = dump_collection("missing", out, client, null_console(), null_console())
        assert written == 0
        assert skipped == 0

    def test_payload_values_serialized_correctly(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        payload = {"text": "hello", "year": 2024, "active": True, "score": 0.9}
        client = _make_client(mocker, points=[payload])
        dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        assert "'year': 2024" in content
        assert "'active': true" in content
        # float repr may vary, so only check the key is present
        assert "'score':" in content

    def test_dump_preserves_point_id_in_insert_values(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "hello"}])
        dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        # _make_client assigns 1-based ids: first point is "id-1"
        assert "'id': 'id-1'" in content

    def test_batches_multiple_scroll_pages(self, tmp_path, mocker):
        """When scroll returns two pages, two INSERT BULK blocks should be written."""
        out = str(tmp_path / "dump.qql")
        # hand-build the client: _make_client only supports single-page scroll
        client = mocker.MagicMock()
        client.collection_exists.return_value = True
        client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
        cnt = mocker.MagicMock()
        cnt.count = _DEFAULT_DUMP_BATCH_SIZE + 1
        client.count.return_value = cnt
        batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(_DEFAULT_DUMP_BATCH_SIZE)]
        batch2 = [_make_record(mocker, {"text": "last doc"}, "id-last")]
        # First scroll call returns batch1 with a non-None offset; second returns batch2 + None
        client.scroll.side_effect = [
            (batch1, "some_offset"),
            (batch2, None),
        ]
        written, skipped = dump_collection("col", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        assert written == _DEFAULT_DUMP_BATCH_SIZE + 1
        assert content.count("INSERT BULK") == 2

    def test_header_contains_collection_name(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "x"}])
        dump_collection("medical_records", out, client, null_console(), null_console())
        content = (tmp_path / "dump.qql").read_text()
        # name must appear after the "QQL Dump" banner, i.e. in the header block
        assert "medical_records" in content.split("QQL Dump")[1]

    def test_output_file_created_in_nested_directory(self, tmp_path, mocker):
        # parent directories do not exist yet — dump must create them
        out = str(tmp_path / "sub" / "dir" / "dump.qql")
        client = _make_client(mocker, points=[{"text": "x"}])
        dump_collection("col", out, client, null_console(), null_console())
        assert (tmp_path / "sub" / "dir" / "dump.qql").exists()

    def test_custom_batch_size_splits_pages(self, tmp_path, mocker):
        """A batch_size of 2 over 3 points should produce two INSERT BULK blocks."""
        out = str(tmp_path / "dump.qql")
        client = mocker.MagicMock()
        client.collection_exists.return_value = True
        client.get_collection.return_value.config.params.vectors = mocker.MagicMock(spec=[])
        cnt = mocker.MagicMock()
        cnt.count = 3
        client.count.return_value = cnt
        batch1 = [_make_record(mocker, {"text": f"doc {i}"}, f"id-{i}") for i in range(2)]
        batch2 = [_make_record(mocker, {"text": "last"}, "id-last")]
        client.scroll.side_effect = [
            (batch1, "offset-1"),
            (batch2, None),
        ]
        written, _ = dump_collection(
            "col", out, client, null_console(), null_console(), batch_size=2
        )
        content = (tmp_path / "dump.qql").read_text()
        assert written == 3
        assert content.count("INSERT BULK") == 2
        # client.scroll should have been called with limit=2
        assert client.scroll.call_args_list[0].kwargs["limit"] == 2

    def test_invalid_batch_size_raises(self, tmp_path, mocker):
        out = str(tmp_path / "dump.qql")
        client = _make_client(mocker, points=[{"text": "x"}])
        with pytest.raises(ValueError):
            dump_collection(
                "col", out, client, null_console(), null_console(), batch_size=0
            )