Add update test

Nikos Papailiou · Nikos Papailiou · commit 23f5a515753b · 2023-09-01T16:24:50.000+03:00
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -103,7 +103,9 @@ def query_additions(self, queries: np.ndarray, k):
 
     def update(self, vector: np.array, external_id: np.uint64):
         updates_array = self.open_updates_array()
-        updates_array[external_id] = vector
+        vectors = np.empty((1), dtype='O')
+        vectors[0] = vector
+        updates_array[external_id] = {'vector': vectors}
         updates_array.close()
         self.consolidate_update_fragments()
 
@@ -115,7 +117,9 @@ def update_batch(self, vectors: np.ndarray, external_ids: np.array):
 
     def delete(self, external_id: np.uint64):
         updates_array = self.open_updates_array()
-        updates_array[external_id] = np.array([], dtype=self.dtype)
+        deletes = np.empty((1), dtype='O')
+        deletes[0] = np.array([], dtype=self.dtype)
+        updates_array[external_id] =  {'vector': deletes}
         updates_array.close()
         self.consolidate_update_fragments()
 
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -307,13 +307,50 @@ def test_ivf_flat_ingestion_with_updates(tmp_path):
     _, result = index.query(query_vectors, k=k, nprobe=nprobe)
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
+    updated_ids = {}
+    for i in range(100):
+        index.delete(external_id=i)
+        index.update(vector=data[i].astype(dtype), external_id=i + 1000000)
+        updated_ids[i + 1000000] = i
+
+    _, result = index.query(query_vectors, k=k, nprobe=nprobe)
+    assert accuracy(result, gt_i, updated_ids=updated_ids) > MINIMUM_ACCURACY
+
+    index = index.consolidate_updates()
+    _, result = index.query(query_vectors, k=k, nprobe=nprobe)
+    assert accuracy(result, gt_i, updated_ids=updated_ids) > MINIMUM_ACCURACY
+
+def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
+    dataset_dir = os.path.join(tmp_path, "dataset")
+    index_uri = os.path.join(tmp_path, "array")
+    k = 10
+    size = 100000
+    partitions = 100
+    dimensions = 128
+    nqueries = 100
+    nprobe = 20
+    data = create_random_dataset_u8(nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir)
+    dtype = np.uint8
+
+    query_vectors = get_queries(dataset_dir, dtype=dtype)
+    gt_i, gt_d = get_groundtruth(dataset_dir, k)
+    index = ingest(
+        index_type="IVF_FLAT",
+        index_uri=index_uri,
+        source_uri=os.path.join(dataset_dir, "data.u8bin"),
+        partitions=partitions,
+        input_vectors_per_work_item=int(size / 10),
+    )
+    _, result = index.query(query_vectors, k=k, nprobe=nprobe)
+    assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
     update_ids = {}
     updated_ids = {}
     for i in range(0, 100000, 2):
         update_ids[i] = i + 1000000
         updated_ids[i + 1000000] = i
-    external_ids = np.zeros((len(update_ids)*2), dtype=np.uint64)
-    updates = np.empty((len(update_ids)*2), dtype='O')
+    external_ids = np.zeros((len(update_ids) * 2), dtype=np.uint64)
+    updates = np.empty((len(update_ids) * 2), dtype='O')
     id = 0
     for prev_id, new_id in update_ids.items():
         external_ids[id] = prev_id