Skip to content

Commit 9895e60

Browse files
committed
feat(usearch): add crash recovery for missing .usearch files
Implement automatic detection and rebuilding of missing .usearch files from LMDB metadata on index startup. This handles crash scenarios where vectors were added to LMDB but the .usearch files were never flushed to disk.

- Add _get_all_tracked_unit_types() to scan metadata for tracked unit_types
- Enhance _load_nphd_indexes() to detect orphaned metadata and trigger rebuilds
- Add comprehensive test coverage for single and multiple missing file scenarios
- Test edge cases including empty databases and natural loop exhaustion

Resolves data loss scenarios during ungraceful shutdowns or crashes.
1 parent 7a98e54 commit 9895e60

File tree

2 files changed

+244
-0
lines changed

2 files changed

+244
-0
lines changed

iscc_search/indexes/usearch/index.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,16 +458,54 @@ def _get_nphd_metadata(self, unit_type):
458458
except lmdb.ReadonlyError: # pragma: no cover
459459
return None
460460

461+
def _get_all_tracked_unit_types(self):
    # type: () -> set[str]
    """
    Get all unit_types tracked in LMDB metadata.

    Scans the '__metadata__' database for keys of the form
    'nphd_count:<unit_type>' and collects the '<unit_type>' suffixes.

    :return: Set of unit_type identifiers that have an 'nphd_count:*' metadata entry
    """
    unit_types = set()  # type: set[str]
    prefix = b"nphd_count:"

    try:
        with self.env.begin() as txn:
            metadata_db = self.env.open_db(b"__metadata__", txn=txn)
            cursor = txn.cursor(metadata_db)

            # Keys are stored in sorted order, so every key sharing the prefix
            # is contiguous starting at the first key >= prefix.
            if cursor.set_range(prefix):
                # Only the keys are needed; skip fetching the stored values.
                for key_bytes in cursor.iternext(values=False):
                    if not key_bytes.startswith(prefix):
                        # Walked past the last 'nphd_count:*' key
                        break
                    # Key format: "nphd_count:UNIT_TYPE"
                    unit_types.add(key_bytes[len(prefix) :].decode())

    except lmdb.ReadonlyError:  # pragma: no cover
        # Database doesn't exist yet (empty index)
        pass

    return unit_types
492+
461493
def _load_nphd_indexes(self):
462494
# type: () -> None
463495
"""
464496
Load existing NphdIndex files with auto-rebuild on sync mismatch.
465497
466498
Compares actual vector count in .usearch file with expected count
467499
in LMDB metadata. Triggers full rebuild from LMDB if out of sync.
500+
Also rebuilds missing .usearch files for unit_types tracked in metadata
501+
(crash recovery for unflushed indexes).
468502
"""
503+
# Track which unit_types we've loaded from disk
504+
loaded_unit_types = set() # type: set[str]
505+
469506
for usearch_file in self.path.glob("*.usearch"):
470507
unit_type = usearch_file.stem # Filename without extension
508+
loaded_unit_types.add(unit_type)
471509
try:
472510
# Note: restore() gets max_dim from saved metadata, don't pass it
473511
nphd_index = NphdIndex.restore(str(usearch_file))
@@ -497,6 +535,19 @@ def _load_nphd_indexes(self):
497535
logger.warning(f"Failed to load NphdIndex '{usearch_file}': {e}. Rebuilding...")
498536
self._rebuild_nphd_index(unit_type)
499537

538+
# Check for orphaned metadata (tracked unit_types without .usearch files)
539+
# This handles crash recovery when vectors were added but never flushed
540+
tracked_unit_types = self._get_all_tracked_unit_types()
541+
missing_unit_types = tracked_unit_types - loaded_unit_types
542+
543+
if missing_unit_types:
544+
logger.warning(
545+
f"Found {len(missing_unit_types)} unit_type(s) in metadata without .usearch files: "
546+
f"{sorted(missing_unit_types)}. Rebuilding from LMDB (crash recovery)..."
547+
)
548+
for unit_type in missing_unit_types:
549+
self._rebuild_nphd_index(unit_type)
550+
500551
def _rebuild_nphd_index(self, unit_type):
501552
# type: (str) -> None
502553
"""

tests/test_indexes_usearch_persistence.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,3 +212,196 @@ def test_usearch_index_no_save_on_add(tmp_path, sample_iscc_ids):
212212

213213
# Now file should exist
214214
assert usearch_file.exists(), "NphdIndex file should exist after close()"
215+
216+
217+
def test_usearch_index_crash_recovery_rebuild_missing_files(tmp_path, sample_iscc_ids):
    """
    Crash recovery: missing .usearch files are rebuilt from LMDB on startup.

    An asset is added and the LMDB environment is then closed directly,
    bypassing UsearchIndex.close(), so metadata exists in LMDB while no
    .usearch file exists on disk. Reopening the index must recreate the files
    and make the asset searchable again.
    """
    index_path = tmp_path / "crash_recovery"
    expected_files = (
        index_path / "CONTENT_TEXT_V0.usearch",
        index_path / "DATA_NONE_V0.usearch",
    )
    similarity_unit_types = ("CONTENT_TEXT_V0", "DATA_NONE_V0")

    # Populate a fresh index with a single asset carrying several unit types
    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
    content_unit = ic.gen_text_code_v0("Test content for crash recovery")["iscc"]
    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
    idx.add_assets(
        [
            IsccAsset(
                iscc_id=sample_iscc_ids[0],
                units=[instance_unit, content_unit, data_unit],
            )
        ]
    )

    # Metadata counts are recorded, but nothing has been written to disk yet
    for unit_type in similarity_unit_types:
        assert idx._get_nphd_metadata(unit_type) == 1
    for usearch_file in expected_files:
        assert not usearch_file.exists(), "Files should not exist before close()"

    # Simulated crash: close the LMDB environment directly instead of going
    # through UsearchIndex.close()
    idx.env.close()

    # The files must still be absent after the simulated crash
    for usearch_file in expected_files:
        assert not usearch_file.exists()

    # Reopening should detect the missing files and rebuild them from LMDB
    idx2 = UsearchIndex(index_path, realm_id=0, max_dim=256)
    for usearch_file in expected_files:
        assert usearch_file.exists(), "Missing file should be rebuilt on startup"

    # The rebuilt indexes must serve searches for the original asset
    query = IsccAsset(units=[instance_unit, content_unit, data_unit])
    result = idx2.search_assets(query, limit=10)
    assert len(result.matches) == 1
    top_match = result.matches[0]
    assert top_match.iscc_id == sample_iscc_ids[0]

    # Both similarity unit types must contribute to the match
    for unit_type in similarity_unit_types:
        assert unit_type in top_match.matches

    idx2.close()
273+
274+
275+
def test_usearch_index_get_all_tracked_unit_types(tmp_path, sample_iscc_ids):
    """
    Test that _get_all_tracked_unit_types correctly scans metadata.

    Covers an empty index, a populated index, and the cursor-scan branches:
    loop exhausts naturally, loop breaks on the first non-matching key, and
    set_range finds no key at all.
    """
    import struct

    metadata_name = b"__metadata__"
    prefix = b"nphd_count:"
    expected_types = {"CONTENT_TEXT_V0", "DATA_NONE_V0"}

    idx = UsearchIndex(tmp_path / "tracked_types", realm_id=0, max_dim=256)

    # A fresh index tracks nothing
    assert idx._get_all_tracked_unit_types() == set()

    # Two assets that together cover two similarity unit types
    content_unit = ic.gen_text_code_v0("Test content")["iscc"]
    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
    first_asset = IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, content_unit])
    second_asset = IsccAsset(iscc_id=sample_iscc_ids[1], units=[instance_unit, data_unit])
    idx.add_assets([first_asset, second_asset])

    # The INSTANCE unit is expected not to be tracked
    assert idx._get_all_tracked_unit_types() == expected_types

    # Branch: loop exhausts naturally. Removing realm_id is meant to leave the
    # nphd_count:* keys as the last entries in the metadata DB (assumes no
    # other metadata key sorts after the prefix).
    with idx.env.begin(write=True) as txn:
        metadata_db = idx.env.open_db(metadata_name, txn=txn)
        txn.delete(b"realm_id", db=metadata_db)

    assert idx._get_all_tracked_unit_types() == expected_types

    # Setup for the next branch: restore realm_id so a non-matching key
    # follows the prefix range again
    with idx.env.begin(write=True) as txn:
        metadata_db = idx.env.open_db(metadata_name, txn=txn)
        txn.put(b"realm_id", struct.pack(">I", 0), db=metadata_db)

    # Branch: set_range succeeds but the loop breaks immediately. Remove every
    # nphd_count:* key while keeping realm_id.
    with idx.env.begin(write=True) as txn:
        metadata_db = idx.env.open_db(metadata_name, txn=txn)
        cursor = txn.cursor(metadata_db)
        if cursor.set_range(prefix):
            # Collect first, delete afterwards, so the cursor is not
            # invalidated while it is being iterated
            doomed_keys = []
            for key_bytes, _ in cursor:
                if not key_bytes.startswith(prefix):
                    break
                doomed_keys.append(key_bytes)
            for doomed_key in doomed_keys:
                txn.delete(doomed_key, db=metadata_db)

    assert idx._get_all_tracked_unit_types() == set()

    # Branch: set_range returns False. Remove realm_id as well so no key at or
    # after the prefix remains.
    with idx.env.begin(write=True) as txn:
        metadata_db = idx.env.open_db(metadata_name, txn=txn)
        txn.delete(b"realm_id", db=metadata_db)

    assert idx._get_all_tracked_unit_types() == set()

    idx.close()
352+
353+
354+
def test_usearch_index_crash_recovery_multiple_missing_files(tmp_path, sample_iscc_ids):
    """
    Crash recovery with several missing .usearch files at once.

    Three assets, each with CONTENT, DATA and META units, are indexed. The
    LMDB environment is then closed directly to simulate a crash. Reopening
    must rebuild one .usearch file per tracked unit_type and keep every asset
    searchable.
    """
    index_path = tmp_path / "multi_crash_recovery"
    expected_files = (
        index_path / "CONTENT_TEXT_V0.usearch",
        index_path / "DATA_NONE_V0.usearch",
        index_path / "META_NONE_V0.usearch",
    )

    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)

    # Three assets, each carrying three similarity units plus an INSTANCE unit
    assets = []
    for i in range(3):
        content_unit = ic.gen_text_code_v0(f"Content {i}")["iscc"]
        data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
        meta_unit = ic.gen_meta_code_v0(f"Asset {i}")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
        asset = IsccAsset(
            iscc_id=sample_iscc_ids[i],
            units=[instance_unit, content_unit, data_unit, meta_unit],
        )
        assets.append(asset)

    idx.add_assets(assets)

    # All three similarity unit types must be tracked in metadata
    assert idx._get_all_tracked_unit_types() == {"CONTENT_TEXT_V0", "DATA_NONE_V0", "META_NONE_V0"}

    # Simulated crash: close LMDB directly instead of calling UsearchIndex.close()
    idx.env.close()

    # Nothing was written to disk
    for usearch_file in expected_files:
        assert not usearch_file.exists()

    # Reopening should rebuild all three missing files
    idx2 = UsearchIndex(index_path, realm_id=0, max_dim=256)
    for usearch_file in expected_files:
        assert usearch_file.exists()

    # Each original asset must find itself among the search results
    for i, asset_query in enumerate(assets):
        result = idx2.search_assets(asset_query, limit=10)
        assert any(m.iscc_id == sample_iscc_ids[i] for m in result.matches)

    idx2.close()

0 commit comments

Comments
 (0)