Skip to content

Commit 7a98e54

Browse files
committed
test(usearch): add comprehensive persistence and auto-rebuild tests
Add test suite covering the new save-on-close and auto-rebuild features:
- test_usearch_index_save_on_close: verify persistence on close()
- test_usearch_index_flush_method: verify explicit flush() works
- test_usearch_index_auto_rebuild_on_corrupted_file: recovery test
- test_usearch_index_auto_rebuild_on_count_mismatch: sync detection
- test_usearch_index_metadata_tracking: vector count tracking
- test_usearch_index_rebuild_with_no_vectors: edge case handling
- test_usearch_index_no_save_on_add: verify no immediate saves
1 parent ef07989 commit 7a98e54

File tree

1 file changed

+214
-0
lines changed

1 file changed

+214
-0
lines changed
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
"""
2+
Tests for UsearchIndex persistence features: save-on-close and auto-rebuild.
3+
"""
4+
5+
import iscc_core as ic
6+
from iscc_search.indexes.usearch.index import UsearchIndex
7+
from iscc_search.schema import IsccAsset
8+
9+
10+
def test_usearch_index_save_on_close(tmp_path, sample_iscc_ids):
    """Test that NphdIndex files are saved on close() and loaded correctly.

    Adds a single asset, closes the index, checks that the
    ``CONTENT_TEXT_V0.usearch`` file was written, then reopens the index
    and verifies the asset can still be found.

    Both index handles are closed in ``finally`` blocks so a failing
    assertion does not leave the index open inside ``tmp_path``.
    """
    index_path = tmp_path / "save_on_close"

    # Create index and add assets
    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        content_unit = ic.gen_text_code_v0("Test content for save on close")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(
            iscc_id=sample_iscc_ids[0],
            units=[instance_unit, content_unit],
        )
        idx.add_assets([asset])
    finally:
        # Close index (should save NphdIndex files)
        idx.close()

    # Verify .usearch file exists
    usearch_file = index_path / "CONTENT_TEXT_V0.usearch"
    assert usearch_file.exists(), "NphdIndex file should exist after close()"

    # Reopen index and verify data persists
    idx2 = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        query = IsccAsset(units=[instance_unit, content_unit])
        result = idx2.search_assets(query, limit=10)

        assert len(result.matches) == 1
        assert result.matches[0].iscc_id == sample_iscc_ids[0]
    finally:
        idx2.close()
40+
41+
42+
def test_usearch_index_flush_method(tmp_path, sample_iscc_ids):
    """Test explicit flush() saves NphdIndex files without closing.

    After ``flush()`` the ``CONTENT_TEXT_V0.usearch`` file must exist and
    the same index instance must still answer searches.

    The index is closed in a ``finally`` block so a failing assertion does
    not leave it open inside ``tmp_path``.
    """
    index_path = tmp_path / "flush_test"

    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        content_unit = ic.gen_text_code_v0("Test content for flush")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(
            iscc_id=sample_iscc_ids[0],
            units=[instance_unit, content_unit],
        )
        idx.add_assets([asset])

        # Explicit flush (should save without closing)
        idx.flush()

        # Verify .usearch file exists
        usearch_file = index_path / "CONTENT_TEXT_V0.usearch"
        assert usearch_file.exists(), "NphdIndex file should exist after flush()"

        # Index should still be usable
        query = IsccAsset(units=[instance_unit, content_unit])
        result = idx.search_assets(query, limit=10)
        assert len(result.matches) == 1
    finally:
        idx.close()
68+
69+
70+
def test_usearch_index_auto_rebuild_on_corrupted_file(tmp_path, sample_iscc_ids):
    """Test auto-rebuild when .usearch file exists but is corrupted.

    Writes one asset, closes the index, overwrites the persisted
    ``CONTENT_TEXT_V0.usearch`` file with garbage bytes, then reopens the
    index and expects the asset to be searchable again.

    Both index handles are closed in ``finally`` blocks so a failing
    assertion does not leave the index open inside ``tmp_path``.
    """
    index_path = tmp_path / "rebuild_corrupted"

    # Create index and add a single asset with an instance and a content unit
    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        content_unit = ic.gen_text_code_v0("Test content for rebuild corrupted")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"

        asset1 = IsccAsset(
            iscc_id=sample_iscc_ids[0],
            units=[instance_unit, content_unit],
        )
        idx.add_assets([asset1])
    finally:
        idx.close()

    # Corrupt .usearch file by writing garbage
    usearch_file = index_path / "CONTENT_TEXT_V0.usearch"
    usearch_file.write_bytes(b"corrupted data")

    # Reopen index - should detect corruption and auto-rebuild from LMDB
    idx2 = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        # Verify data is recovered via rebuild
        query = IsccAsset(units=[instance_unit, content_unit])
        result = idx2.search_assets(query, limit=10)
        assert len(result.matches) == 1
        assert result.matches[0].iscc_id == sample_iscc_ids[0]
    finally:
        idx2.close()
102+
103+
104+
def test_usearch_index_auto_rebuild_on_count_mismatch(tmp_path, sample_iscc_ids):
    """Test auto-rebuild when vector count doesn't match metadata.

    Stores two assets, closes the index, then overwrites the
    ``nphd_count:CONTENT_TEXT_V0`` entry in the ``__metadata__`` LMDB
    database with 999 so it disagrees with the two stored vectors.
    Reopening the index is expected to make both assets searchable again.

    The raw LMDB environment and both index handles are released in
    ``finally`` blocks so a failure cannot leave them open inside
    ``tmp_path``.
    """
    # Imports stay function-local because only this test touches LMDB directly.
    import struct

    import lmdb

    index_path = tmp_path / "rebuild_mismatch"

    # Create index and add assets
    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        content_unit_1 = ic.gen_text_code_v0("Content 1 for count mismatch")["iscc"]
        content_unit_2 = ic.gen_text_code_v0("Content 2 for count mismatch")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"

        asset1 = IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, content_unit_1])
        asset2 = IsccAsset(iscc_id=sample_iscc_ids[1], units=[instance_unit, content_unit_2])

        idx.add_assets([asset1, asset2])
    finally:
        idx.close()

    # Simulate out-of-sync by manually corrupting metadata AFTER close.
    # Open LMDB directly just to update metadata, without loading indexes.
    env = lmdb.open(str(index_path / "index.lmdb"), subdir=False, max_dbs=3)
    try:
        with env.begin(write=True) as txn:
            metadata_db = env.open_db(b"__metadata__", txn=txn)
            key = b"nphd_count:CONTENT_TEXT_V0"
            # Big-endian unsigned 64-bit count that cannot match the 2 stored vectors
            txn.put(key, struct.pack(">Q", 999), db=metadata_db)
    finally:
        env.close()

    # Reopen - should detect mismatch (999 != 2) and rebuild
    idx2 = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        # Verify both assets are found (rebuild worked)
        result = idx2.search_assets(
            IsccAsset(units=[instance_unit, content_unit_1, content_unit_2]),
            limit=10,
        )
        assert len(result.matches) == 2
    finally:
        idx2.close()
140+
141+
142+
def test_usearch_index_metadata_tracking(tmp_path, sample_iscc_ids):
    """Test that vector count metadata is tracked correctly.

    ``_get_nphd_metadata("CONTENT_TEXT_V0")`` must be ``None`` on a fresh
    index, 1 after the first asset is added and 2 after the second.

    The index is closed in a ``finally`` block so a failing assertion does
    not leave it open inside ``tmp_path``.
    """
    index_path = tmp_path / "metadata_tracking"

    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        # Initially, no metadata
        assert idx._get_nphd_metadata("CONTENT_TEXT_V0") is None

        # Add asset
        content_unit = ic.gen_text_code_v0("Test for metadata tracking")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, content_unit])
        idx.add_assets([asset])

        # Metadata should be updated
        count = idx._get_nphd_metadata("CONTENT_TEXT_V0")
        assert count == 1

        # Add another asset
        content_unit2 = ic.gen_text_code_v0("Another test for metadata tracking")["iscc"]
        asset2 = IsccAsset(iscc_id=sample_iscc_ids[1], units=[instance_unit, content_unit2])
        idx.add_assets([asset2])

        # Metadata should be updated again
        count = idx._get_nphd_metadata("CONTENT_TEXT_V0")
        assert count == 2
    finally:
        idx.close()
171+
172+
173+
def test_usearch_index_rebuild_with_no_vectors(tmp_path, sample_iscc_ids):
    """Test rebuild handles case where no vectors exist for unit_type.

    Calls ``_rebuild_nphd_index`` for a unit type that was never indexed
    and checks that it returns without raising and adds no entry to
    ``_nphd_indexes``.

    The index is closed in a ``finally`` block so a failure does not leave
    it open inside ``tmp_path``.
    """
    index_path = tmp_path / "rebuild_no_vectors"

    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        # Add at least one asset to initialize the database
        content_unit = ic.gen_text_code_v0("Test for rebuild with no vectors")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, content_unit])
        idx.add_assets([asset])

        # Trigger rebuild for non-existent unit_type
        idx._rebuild_nphd_index("NONEXISTENT_TYPE")

        # Should complete without error, no index created
        assert "NONEXISTENT_TYPE" not in idx._nphd_indexes
    finally:
        idx.close()
192+
193+
194+
def test_usearch_index_no_save_on_add(tmp_path, sample_iscc_ids):
    """Test that add_assets does NOT immediately save to disk.

    The ``CONTENT_TEXT_V0.usearch`` file must be absent after
    ``add_assets()`` and present only once ``close()`` has run.

    ``close()`` runs in a ``finally`` block so the index is released even
    when the "not yet saved" assertion fails.
    """
    index_path = tmp_path / "no_save_on_add"
    usearch_file = index_path / "CONTENT_TEXT_V0.usearch"

    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    try:
        content_unit = ic.gen_text_code_v0("Test no save on add")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, content_unit])

        # Add asset but don't close
        idx.add_assets([asset])

        # .usearch file should NOT exist yet (save-on-close only)
        assert not usearch_file.exists(), "NphdIndex file should not exist before close()"
    finally:
        # Close to save
        idx.close()

    # Now file should exist
    assert usearch_file.exists(), "NphdIndex file should exist after close()"

0 commit comments

Comments
 (0)