@@ -288,11 +288,11 @@ def test_ivf_flat_ingestion_with_updates(tmp_path):
288288 dataset_dir = os .path .join (tmp_path , "dataset" )
289289 index_uri = os .path .join (tmp_path , "array" )
290290 k = 10
291- size = 100000
292- partitions = 100
291+ size = 1000
292+ partitions = 10
293293 dimensions = 128
294294 nqueries = 100
295- nprobe = 20
295+ nprobe = 10
296296 data = create_random_dataset_u8 (nb = size , d = dimensions , nq = nqueries , k = k , path = dataset_dir )
297297 dtype = np .uint8
298298
@@ -303,24 +303,23 @@ def test_ivf_flat_ingestion_with_updates(tmp_path):
303303 index_uri = index_uri ,
304304 source_uri = os .path .join (dataset_dir , "data.u8bin" ),
305305 partitions = partitions ,
306- input_vectors_per_work_item = int (size / 10 ),
307306 )
308307 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
309- assert accuracy (result , gt_i ) > MINIMUM_ACCURACY
308+ assert accuracy (result , gt_i ) == 1.0
310309
311310 update_ids_offset = MAX_UINT64 - size
312311 updated_ids = {}
313312 for i in range (100 ):
314313 index .delete (external_id = i )
315314 index .update (vector = data [i ].astype (dtype ), external_id = i + update_ids_offset )
316- updated_ids [i + update_ids_offset ] = i
315+ updated_ids [i ] = i + update_ids_offset
317316
318317 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
319- assert accuracy (result , gt_i , updated_ids = updated_ids ) > MINIMUM_ACCURACY
318+ assert accuracy (result , gt_i , updated_ids = updated_ids ) == 1.0
320319
321320 index = index .consolidate_updates ()
322321 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
323- assert accuracy (result , gt_i , updated_ids = updated_ids ) > MINIMUM_ACCURACY
322+ assert accuracy (result , gt_i , updated_ids = updated_ids ) == 1.0
324323
325324def test_ivf_flat_ingestion_with_batch_updates (tmp_path ):
326325 dataset_dir = os .path .join (tmp_path , "dataset" )
@@ -330,7 +329,7 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
330329 partitions = 100
331330 dimensions = 128
332331 nqueries = 100
333- nprobe = 20
332+ nprobe = 100
334333 data = create_random_dataset_u8 (nb = size , d = dimensions , nq = nqueries , k = k , path = dataset_dir )
335334 dtype = np .uint8
336335
@@ -344,18 +343,18 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
344343 input_vectors_per_work_item = int (size / 10 ),
345344 )
346345 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
347- assert accuracy (result , gt_i ) > MINIMUM_ACCURACY
346+ assert accuracy (result , gt_i ) > 0.99
348347
349348 update_ids = {}
350349 updated_ids = {}
351350 update_ids_offset = MAX_UINT64 - size
352351 for i in range (0 , 100000 , 2 ):
353- update_ids [i ] = i + update_ids_offset
354- updated_ids [i + update_ids_offset ] = i
355- external_ids = np .zeros ((len (update_ids ) * 2 ), dtype = np .uint64 )
356- updates = np .empty ((len (update_ids ) * 2 ), dtype = 'O' )
352+ updated_ids [i ] = i + update_ids_offset
353+ update_ids [i + update_ids_offset ] = i
354+ external_ids = np .zeros ((len (updated_ids ) * 2 ), dtype = np .uint64 )
355+ updates = np .empty ((len (updated_ids ) * 2 ), dtype = 'O' )
357356 id = 0
358- for prev_id , new_id in update_ids .items ():
357+ for prev_id , new_id in updated_ids .items ():
359358 external_ids [id ] = prev_id
360359 updates [id ] = np .array ([], dtype = dtype )
361360 id += 1
@@ -365,9 +364,102 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
365364
366365 index .update_batch (vectors = updates , external_ids = external_ids )
367366 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
368- assert accuracy (result , gt_i , updated_ids = updated_ids ) > MINIMUM_ACCURACY
367+ assert accuracy (result , gt_i , updated_ids = updated_ids ) > 0.99
369368
370369 index = index .consolidate_updates ()
371370 _ , result = index .query (query_vectors , k = k , nprobe = nprobe )
372- assert accuracy (result , gt_i , updated_ids = updated_ids ) > MINIMUM_ACCURACY
371+ assert accuracy (result , gt_i , updated_ids = updated_ids ) > 0.99
372+
def test_ivf_flat_ingestion_with_updates_and_timetravel(tmp_path):
    """Verify timetravel queries against an IVF_FLAT index with timestamped updates.

    Ingests at timestamp 1, then at each timestamp i in [2, 102) deletes
    external id ``i`` and re-adds the same vector under a remapped id near
    MAX_UINT64.  The same battery of timetravel checks is run twice — before
    and after ``consolidate_updates()`` — to confirm consolidation does not
    change what each (range of) timestamp(s) can see.
    """
    dataset_dir = os.path.join(tmp_path, "dataset")
    index_uri = os.path.join(tmp_path, "array")
    k = 10
    size = 1000
    partitions = 10
    dimensions = 128
    nqueries = 100
    nprobe = 10
    data = create_random_dataset_u8(
        nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir
    )
    dtype = np.uint8

    query_vectors = get_queries(dataset_dir, dtype=dtype)
    gt_i, gt_d = get_groundtruth(dataset_dir, k)
    index = ingest(
        index_type="IVF_FLAT",
        index_uri=index_uri,
        source_uri=os.path.join(dataset_dir, "data.u8bin"),
        partitions=partitions,
        index_timestamp=1,
    )
    _, result = index.query(query_vectors, k=k, nprobe=nprobe)
    assert accuracy(result, gt_i) == 1.0

    # At timestamp i (2..101): delete external id i, then re-add the identical
    # vector under the remapped id i + update_ids_offset at the same timestamp.
    update_ids_offset = MAX_UINT64 - size
    updated_ids = {}
    for i in range(2, 102):
        index.delete(external_id=i, timestamp=i)
        index.update(
            vector=data[i].astype(dtype),
            external_id=i + update_ids_offset,
            timestamp=i,
        )
        updated_ids[i] = i + update_ids_offset

    def accuracy_at(timestamp, **accuracy_kwargs):
        # Reopen the index at `timestamp` (an int or an (open, close) tuple,
        # as accepted by IVFFlatIndex), query, and return the accuracy.
        # Extra kwargs (updated_ids=..., only_updated_ids=True) are forwarded
        # to accuracy().
        reopened = IVFFlatIndex(uri=index_uri, timestamp=timestamp)
        _, res = reopened.query(query_vectors, k=k, nprobe=nprobe)
        return accuracy(res, gt_i, **accuracy_kwargs)

    def run_timetravel_checks():
        # At (or up to) timestamp 101 every update is visible.
        assert accuracy_at(101, updated_ids=updated_ids) == 1.0
        assert accuracy_at((0, 101), updated_ids=updated_ids) == 1.0
        # Opening only (2, 101) excludes the base ingestion at timestamp 1,
        # so just the re-added vectors are reachable.
        assert 0.05 <= accuracy_at(
            (2, 101), updated_ids=updated_ids, only_updated_ids=True
        ) <= 0.15

        # Timetravel with partial read from updates table: by timestamp 51
        # only ids 2..51 have been remapped.
        updated_ids_part = {
            i: i + update_ids_offset for i in range(2, 52)
        }
        assert accuracy_at(51, updated_ids=updated_ids_part) == 1.0
        assert accuracy_at((0, 51), updated_ids=updated_ids_part) == 1.0
        # NOTE(review): the full `updated_ids` map (not `updated_ids_part`) is
        # passed here, so roughly half the remapped ids cannot exist yet at
        # timestamp 51 — the 0.02..0.07 bounds (about half of 0.05..0.15 above)
        # suggest this is deliberate, but confirm the intent.
        assert 0.02 <= accuracy_at(
            (2, 51), updated_ids=updated_ids, only_updated_ids=True
        ) <= 0.07

        # Timetravel at the original ingestion timestamp: no updates visible.
        assert accuracy_at(1) == 1.0

    run_timetravel_checks()

    # Consolidate updates and verify every timetravel view is unchanged.
    index = index.consolidate_updates()
    run_timetravel_checks()

    assert accuracy_at((0, 1)) == 1.0
0 commit comments