feat(usearch): add crash recovery for missing .usearch files

titusz · titusz · commit 9895e602a21d · 2025-10-29T12:02:08.000+01:00
Implement automatic detection and rebuilding of missing .usearch files
from LMDB metadata on index startup. This handles crash scenarios where
vectors were added to LMDB but .usearch files were never flushed to disk.

- Add _get_all_tracked_unit_types() to scan metadata for tracked unit_types
- Enhance _load_nphd_indexes() to detect orphaned metadata and trigger rebuilds
- Add comprehensive test coverage for single and multiple missing file scenarios
- Test edge cases including empty databases and natural loop exhaustion

Resolves data loss scenarios during ungraceful shutdowns or crashes.
diff --git a/iscc_search/indexes/usearch/index.py b/iscc_search/indexes/usearch/index.py
@@ -458,16 +458,54 @@ def _get_nphd_metadata(self, unit_type):
         except lmdb.ReadonlyError:  # pragma: no cover
             return None
 
+    def _get_all_tracked_unit_types(self):
+        # type: () -> set[str]
+        """
+        Collect every unit_type that has an 'nphd_count:*' entry in LMDB metadata.
+
+        Walks the '__metadata__' database and strips the key prefix to get each unit_type.
+
+        :return: Set of unit_type identifiers found in the metadata keys
+        """
+        key_prefix = b"nphd_count:"
+        prefix_len = len(key_prefix)
+        found = set()  # type: set[str]
+
+        try:
+            with self.env.begin() as txn:
+                cursor = txn.cursor(self.env.open_db(b"__metadata__", txn=txn))
+
+                # Seek to the first key matching the prefix; when the seek fails
+                # there is nothing to scan, so iterate over an empty tuple instead
+                has_candidate = cursor.set_range(key_prefix)
+                for raw_key, _ in (cursor if has_candidate else ()):
+                    if not raw_key.startswith(key_prefix):
+                        break
+                    # Key format is "nphd_count:UNIT_TYPE"; keep only UNIT_TYPE
+                    found.add(raw_key[prefix_len:].decode())
+
+        except lmdb.ReadonlyError:  # pragma: no cover
+            # Database doesn't exist yet (empty index)
+            pass
+
+        return found
+
     def _load_nphd_indexes(self):
         # type: () -> None
         """
         Load existing NphdIndex files with auto-rebuild on sync mismatch.
 
         Compares actual vector count in .usearch file with expected count
         in LMDB metadata. Triggers full rebuild from LMDB if out of sync.
+        Also rebuilds missing .usearch files for unit_types tracked in metadata
+        (crash recovery for unflushed indexes).
         """
+        # Track which unit_types we've loaded from disk
+        loaded_unit_types = set()  # type: set[str]
+
         for usearch_file in self.path.glob("*.usearch"):
             unit_type = usearch_file.stem  # Filename without extension
+            loaded_unit_types.add(unit_type)
             try:
                 # Note: restore() gets max_dim from saved metadata, don't pass it
                 nphd_index = NphdIndex.restore(str(usearch_file))
@@ -497,6 +535,19 @@ def _load_nphd_indexes(self):
                 logger.warning(f"Failed to load NphdIndex '{usearch_file}': {e}. Rebuilding...")
                 self._rebuild_nphd_index(unit_type)
 
+        # Check for orphaned metadata (tracked unit_types without .usearch files)
+        # This handles crash recovery when vectors were added but never flushed
+        tracked_unit_types = self._get_all_tracked_unit_types()
+        missing_unit_types = tracked_unit_types - loaded_unit_types
+
+        if missing_unit_types:
+            logger.warning(
+                f"Found {len(missing_unit_types)} unit_type(s) in metadata without .usearch files: "
+                f"{sorted(missing_unit_types)}. Rebuilding from LMDB (crash recovery)..."
+            )
+            for unit_type in missing_unit_types:
+                self._rebuild_nphd_index(unit_type)
+
     def _rebuild_nphd_index(self, unit_type):
         # type: (str) -> None
         """
diff --git a/tests/test_indexes_usearch_persistence.py b/tests/test_indexes_usearch_persistence.py
@@ -212,3 +212,196 @@ def test_usearch_index_no_save_on_add(tmp_path, sample_iscc_ids):
 
     # Now file should exist
     assert usearch_file.exists(), "NphdIndex file should exist after close()"
+
+
+def test_usearch_index_crash_recovery_rebuild_missing_files(tmp_path, sample_iscc_ids):
+    """
+    Reopening after a simulated crash recreates absent .usearch files from LMDB.
+
+    The crash is simulated by closing only the LMDB environment, so metadata
+    is persisted while no .usearch file has ever been written to disk.
+    """
+    index_path = tmp_path / "crash_recovery"
+    # Unit types whose metadata and .usearch files this test tracks
+    similarity_types = ("CONTENT_TEXT_V0", "DATA_NONE_V0")
+    usearch_files = [index_path / f"{name}.usearch" for name in similarity_types]
+
+    # Populate a fresh index with one asset carrying several unit types
+    crashed = UsearchIndex(index_path, realm_id=0, max_dim=256)
+    content_unit = ic.gen_text_code_v0("Test content for crash recovery")["iscc"]
+    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
+    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
+    units = [instance_unit, content_unit, data_unit]
+    asset = IsccAsset(iscc_id=sample_iscc_ids[0], units=units)
+    crashed.add_assets([asset])
+
+    # Metadata counts one vector for each tracked unit type ...
+    for name in similarity_types:
+        assert crashed._get_nphd_metadata(name) == 1
+
+    # ... but add_assets() alone must not have written any file
+    for path in usearch_files:
+        assert not path.exists(), "Files should not exist before close()"
+
+    # Simulate the crash: shut down LMDB directly and skip UsearchIndex.close(),
+    # so the NphdIndex files are never saved
+    crashed.env.close()
+
+    # Still nothing on disk, exactly as after a crash before flush
+    for path in usearch_files:
+        assert not path.exists()
+
+    # Reopening should notice the missing files and rebuild them from LMDB
+    recovered = UsearchIndex(index_path, realm_id=0, max_dim=256)
+
+    # The rebuild must have produced both files
+    for path in usearch_files:
+        assert path.exists(), "Missing file should be rebuilt on startup"
+
+    # A search that finds the asset proves the rebuilt data is usable
+    query = IsccAsset(units=list(units))
+    result = recovered.search_assets(query, limit=10)
+    assert len(result.matches) == 1
+    top_match = result.matches[0]
+    assert top_match.iscc_id == sample_iscc_ids[0]
+
+    # Both tracked unit types must appear in the match details
+    for name in similarity_types:
+        assert name in top_match.matches
+
+    recovered.close()
+
+
+def test_usearch_index_get_all_tracked_unit_types(tmp_path, sample_iscc_ids):
+    """Test _get_all_tracked_unit_types correctly scans metadata."""
+    index_path = tmp_path / "tracked_types"
+
+    idx = UsearchIndex(index_path, realm_id=0, max_dim=256)
+
+    # Fresh index: nothing has been added, so no unit_types are tracked yet
+    tracked = idx._get_all_tracked_unit_types()
+    assert tracked == set()
+
+    # Add two assets that share an instance unit type but differ in content/data
+    content_unit = ic.gen_text_code_v0("Test content")["iscc"]
+    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
+    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
+
+    asset1 = IsccAsset(
+        iscc_id=sample_iscc_ids[0],
+        units=[instance_unit, content_unit],
+    )
+    asset2 = IsccAsset(
+        iscc_id=sample_iscc_ids[1],
+        units=[instance_unit, data_unit],
+    )
+    idx.add_assets([asset1, asset2])
+
+    # Check tracked types (INSTANCE is not a similarity unit, so it shouldn't appear)
+    tracked = idx._get_all_tracked_unit_types()
+    assert tracked == {"CONTENT_TEXT_V0", "DATA_NONE_V0"}
+
+    # Natural loop exhaustion: delete realm_id so nphd_count keys are last in DB
+    # This tests the branch where the for loop completes naturally without breaking
+    import struct
+
+    with idx.env.begin(write=True) as txn:
+        metadata_db = idx.env.open_db(b"__metadata__", txn=txn)
+        txn.delete(b"realm_id", db=metadata_db)
+
+    # Should still find the tracked types (loop exhausts naturally, no break)
+    tracked = idx._get_all_tracked_unit_types()
+    assert tracked == {"CONTENT_TEXT_V0", "DATA_NONE_V0"}
+
+    # Setup for edge case 1: restore realm_id (nothing is deleted in this step)
+    with idx.env.begin(write=True) as txn:
+        metadata_db = idx.env.open_db(b"__metadata__", txn=txn)
+        # Add realm_id back (big-endian uint32 0, same value as realm_id=0 above)
+        txn.put(b"realm_id", struct.pack(">I", 0), db=metadata_db)
+
+    # Edge case 1: delete nphd_count keys but keep realm_id
+    # This tests the branch where set_range returns True but loop breaks immediately
+    with idx.env.begin(write=True) as txn:
+        metadata_db = idx.env.open_db(b"__metadata__", txn=txn)
+        cursor = txn.cursor(metadata_db)
+        # Collect all nphd_count:* keys first, then delete them after the scan
+        prefix = b"nphd_count:"
+        if cursor.set_range(prefix):
+            keys_to_delete = []
+            for key_bytes, _ in cursor:
+                if not key_bytes.startswith(prefix):
+                    break
+                keys_to_delete.append(key_bytes)
+            for key in keys_to_delete:
+                txn.delete(key, db=metadata_db)
+
+    # Should return empty set (loop breaks on the first non-matching key)
+    tracked = idx._get_all_tracked_unit_types()
+    assert tracked == set()
+
+    # Edge case 2: delete realm_id too so set_range returns False
+    with idx.env.begin(write=True) as txn:
+        metadata_db = idx.env.open_db(b"__metadata__", txn=txn)
+        txn.delete(b"realm_id", db=metadata_db)
+
+    # Should return empty set (set_range returns False, so the loop never runs)
+    tracked = idx._get_all_tracked_unit_types()
+    assert tracked == set()
+
+    idx.close()
+
+
+def test_usearch_index_crash_recovery_multiple_missing_files(tmp_path, sample_iscc_ids):
+    """
+    Crash recovery must restore every missing .usearch file, not just one.
+
+    Three similarity unit_types are indexed, the crash is simulated, and all
+    three files are expected to reappear once the index is reopened.
+    """
+    index_path = tmp_path / "multi_crash_recovery"
+    # unit_types that must be tracked in metadata and rebuilt as files
+    expected_types = {"CONTENT_TEXT_V0", "DATA_NONE_V0", "META_NONE_V0"}
+    usearch_files = [index_path / f"{name}.usearch" for name in sorted(expected_types)]
+
+    # Index three assets, each carrying content, data, meta and instance units
+    crashed = UsearchIndex(index_path, realm_id=0, max_dim=256)
+
+    assets = []
+    for i in range(3):
+        content_unit = ic.gen_text_code_v0(f"Content {i}")["iscc"]
+        data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)}"
+        meta_unit = ic.gen_meta_code_v0(f"Asset {i}")["iscc"]
+        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)}"
+        units = [instance_unit, content_unit, data_unit, meta_unit]
+        assets.append(IsccAsset(iscc_id=sample_iscc_ids[i], units=units))
+
+    crashed.add_assets(assets)
+
+    # All three similarity types must now be tracked in metadata
+    tracked = crashed._get_all_tracked_unit_types()
+    assert tracked == expected_types
+
+    # Simulate crash: close the LMDB env only, bypassing UsearchIndex.close(),
+    # so no .usearch file gets saved
+    crashed.env.close()
+
+    # Nothing was flushed, so none of the files may exist
+    for path in usearch_files:
+        assert not path.exists()
+
+    # Reopen - startup should rebuild all three missing files
+    recovered = UsearchIndex(index_path, realm_id=0, max_dim=256)
+
+    # Every file must have been recreated
+    for path in usearch_files:
+        assert path.exists()
+
+    # Each asset is used as its own query and must show up among the
+    # matches returned by the recovered index
+    for i, asset_query in enumerate(assets):
+        asset_id = sample_iscc_ids[i]
+        result = recovered.search_assets(asset_query, limit=10)
+        found_ids = [m.iscc_id for m in result.matches]
+        assert asset_id in found_ids
+
+    recovered.close()