@@ -212,3 +212,196 @@ def test_usearch_index_no_save_on_add(tmp_path, sample_iscc_ids):
212212
213213 # Now file should exist
214214 assert usearch_file .exists (), "NphdIndex file should exist after close()"
215+
216+
def test_usearch_index_crash_recovery_rebuild_missing_files(tmp_path, sample_iscc_ids):
    """
    Crash recovery: reopening an index recreates absent .usearch files.

    One asset is added, then only the LMDB environment is closed (skipping
    UsearchIndex.close()), so no .usearch file is ever written. A second
    UsearchIndex opened on the same path must produce the files and return
    the stored asset from a search.
    """
    root = tmp_path / "crash_recovery"

    first = UsearchIndex(root, realm_id=0, max_dim=256)

    # One asset carrying an instance unit plus two similarity units.
    # NOTE(review): the f-strings below end with a literal space before the
    # closing quote; kept verbatim - confirm that it is intended.
    text_unit = ic.gen_text_code_v0("Test content for crash recovery")["iscc"]
    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)} "
    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)} "
    unit_order = (instance_unit, text_unit, data_unit)

    first.add_assets([IsccAsset(iscc_id=sample_iscc_ids[0], units=list(unit_order))])

    # The per-type counters are recorded in metadata ...
    for unit_type in ("CONTENT_TEXT_V0", "DATA_NONE_V0"):
        assert first._get_nphd_metadata(unit_type) == 1

    # ... while nothing has been written next to the LMDB data yet.
    content_path = root / "CONTENT_TEXT_V0.usearch"
    data_path = root / "DATA_NONE_V0.usearch"
    for usearch_path in (content_path, data_path):
        assert not usearch_path.exists(), "Files should not exist before close()"

    # Simulated crash: close the raw LMDB env directly instead of calling
    # UsearchIndex.close(), so the NphdIndex files are never saved.
    first.env.close()

    for usearch_path in (content_path, data_path):
        assert not usearch_path.exists()

    # Opening the same path again is expected to rebuild the missing files.
    reopened = UsearchIndex(root, realm_id=0, max_dim=256)

    for usearch_path in (content_path, data_path):
        assert usearch_path.exists(), "Missing file should be rebuilt on startup"

    # A search over the same units must find exactly the stored asset.
    outcome = reopened.search_assets(IsccAsset(units=list(unit_order)), limit=10)
    assert len(outcome.matches) == 1
    top_hit = outcome.matches[0]
    assert top_hit.iscc_id == sample_iscc_ids[0]

    # Both similarity unit types contributed to the match.
    for unit_type in ("CONTENT_TEXT_V0", "DATA_NONE_V0"):
        assert unit_type in top_hit.matches

    reopened.close()
273+
274+
def test_usearch_index_get_all_tracked_unit_types(tmp_path, sample_iscc_ids):
    """Exercise _get_all_tracked_unit_types against several metadata layouts."""
    import struct

    idx = UsearchIndex(tmp_path / "tracked_types", realm_id=0, max_dim=256)
    count_prefix = b"nphd_count:"
    both_types = {"CONTENT_TEXT_V0", "DATA_NONE_V0"}

    def remove_realm_id():
        # Drop the realm_id entry from the __metadata__ sub-database.
        with idx.env.begin(write=True) as txn:
            meta_db = idx.env.open_db(b"__metadata__", txn=txn)
            txn.delete(b"realm_id", db=meta_db)

    # A fresh index tracks nothing.
    assert idx._get_all_tracked_unit_types() == set()

    # Two assets that together cover two similarity unit types.
    # NOTE(review): the f-strings below end with a literal space before the
    # closing quote; kept verbatim - confirm that it is intended.
    text_unit = ic.gen_text_code_v0("Test content")["iscc"]
    data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)} "
    instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)} "

    idx.add_assets(
        [
            IsccAsset(iscc_id=sample_iscc_ids[0], units=[instance_unit, text_unit]),
            IsccAsset(iscc_id=sample_iscc_ids[1], units=[instance_unit, data_unit]),
        ]
    )

    # INSTANCE is not a similarity unit, so only the other two are reported.
    assert idx._get_all_tracked_unit_types() == both_types

    # Case A - scan runs off the end of the DB: without realm_id the
    # nphd_count:* keys are meant to be the last keys, so the loop finishes
    # without hitting a non-matching key.
    remove_realm_id()
    assert idx._get_all_tracked_unit_types() == both_types

    # Put realm_id back so a non-matching key again follows the prefix range.
    with idx.env.begin(write=True) as txn:
        meta_db = idx.env.open_db(b"__metadata__", txn=txn)
        txn.put(b"realm_id", struct.pack(">I", 0), db=meta_db)

    # Case B - no nphd_count:* keys but realm_id present: set_range still
    # lands on a key, and the scan stops on that first non-matching key.
    with idx.env.begin(write=True) as txn:
        meta_db = idx.env.open_db(b"__metadata__", txn=txn)
        cursor = txn.cursor(meta_db)
        doomed = []
        if cursor.set_range(count_prefix):
            for raw_key, _value in cursor:
                if raw_key.startswith(count_prefix):
                    doomed.append(raw_key)
                else:
                    break
        # Delete only after the scan so the cursor is not disturbed mid-walk.
        for raw_key in doomed:
            txn.delete(raw_key, db=meta_db)

    assert idx._get_all_tracked_unit_types() == set()

    # Case C - realm_id removed as well, so set_range finds no key at all.
    remove_realm_id()
    assert idx._get_all_tracked_unit_types() == set()

    idx.close()
352+
353+
def test_usearch_index_crash_recovery_multiple_missing_files(tmp_path, sample_iscc_ids):
    """
    Crash recovery when several .usearch files are missing at once.

    Three assets with CONTENT, DATA and META units are added, the LMDB
    environment is closed without UsearchIndex.close(), and a reopened index
    must recreate one .usearch file per tracked unit type and still find
    every asset.
    """
    root = tmp_path / "multi_crash_recovery"
    usearch_files = [
        root / "CONTENT_TEXT_V0.usearch",
        root / "DATA_NONE_V0.usearch",
        root / "META_NONE_V0.usearch",
    ]

    first = UsearchIndex(root, realm_id=0, max_dim=256)

    # Build three assets, each with three similarity units and an instance unit.
    # NOTE(review): the f-strings below end with a literal space before the
    # closing quote; kept verbatim - confirm that it is intended.
    stored = []
    for n in range(3):
        text_unit = ic.gen_text_code_v0(f"Content {n} ")["iscc"]
        data_unit = f"ISCC:{ic.Code.rnd(ic.MT.DATA, bits=128)} "
        meta_unit = ic.gen_meta_code_v0(f"Asset {n} ")["iscc"]
        instance_unit = f"ISCC:{ic.Code.rnd(ic.MT.INSTANCE, bits=128)} "
        stored.append(
            IsccAsset(
                iscc_id=sample_iscc_ids[n],
                units=[instance_unit, text_unit, data_unit, meta_unit],
            )
        )

    first.add_assets(stored)

    # All three similarity unit types are recorded in metadata.
    assert first._get_all_tracked_unit_types() == {
        "CONTENT_TEXT_V0",
        "DATA_NONE_V0",
        "META_NONE_V0",
    }

    # Simulated crash: close only the LMDB env, bypassing UsearchIndex.close().
    first.env.close()

    for usearch_path in usearch_files:
        assert not usearch_path.exists()

    # Reopening is expected to rebuild every missing file.
    reopened = UsearchIndex(root, realm_id=0, max_dim=256)

    for usearch_path in usearch_files:
        assert usearch_path.exists()

    # Each stored asset, used as its own query, must show up in the results.
    for position, stored_asset in enumerate(stored):
        found = reopened.search_assets(stored_asset, limit=10)
        assert any(hit.iscc_id == sample_iscc_ids[position] for hit in found.matches)

    reopened.close()
0 commit comments