Skip to content

Commit bb433db

Browse files
authored
Have consolidate_updates() reuse existing centroids by default with an option to re-compute them (#178)
1 parent 9c029e9 commit bb433db

File tree

3 files changed

+111
-8
lines changed

3 files changed

+111
-8
lines changed

apis/python/src/tiledb/vector_search/index.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,9 +354,22 @@ def open_updates_array(self, timestamp: int = None):
354354
timestamp = int(time.time() * 1000)
355355
return tiledb.open(self.updates_array_uri, mode="w", timestamp=timestamp)
356356

357-
def consolidate_updates(self, **kwargs):
357+
def consolidate_updates(
358+
self,
359+
retrain_index: bool = False,
360+
**kwargs
361+
):
362+
"""
363+
Parameters
364+
----------
365+
retrain_index: bool
366+
If true, retrain the index. If false, reuse data from the previous index.
367+
For IVF_FLAT, retraining means we will recompute the centroids - when doing so you can
368+
pass any ingest() arguments used to configure computing centroids and we will use them
369+
when recomputing the centroids. Otherwise, if false, we will reuse the centroids from
370+
the previous index.
371+
"""
358372
from tiledb.vector_search.ingestion import ingest
359-
360373
fragments_info = tiledb.array_fragments(
361374
self.updates_array_uri, ctx=tiledb.Ctx(self.config)
362375
)
@@ -371,6 +384,15 @@ def consolidate_updates(self, **kwargs):
371384
tiledb.consolidate(self.updates_array_uri, config=conf)
372385
tiledb.vacuum(self.updates_array_uri, config=conf)
373386

387+
# We don't copy the centroids if self.partitions=0 because this means our index was previously empty.
388+
should_pass_copy_centroids_uri = self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0
389+
if should_pass_copy_centroids_uri:
390+
# Make sure the user didn't pass an incorrect number of partitions.
391+
if 'partitions' in kwargs and self.partitions != kwargs['partitions']:
392+
raise ValueError(f"The passed partitions={kwargs['partitions']} is different from the number of partitions ({self.partitions}) from when the index was created - this is an issue because with retrain_index=False, the partitions from the previous index will be used; to fix, set retrain_index=True, don't pass partitions, or pass the correct number of partitions.")
393+
# We pass partitions through kwargs so that we don't pass it twice.
394+
kwargs['partitions'] = self.partitions
395+
374396
new_index = ingest(
375397
index_type=self.index_type,
376398
index_uri=self.uri,
@@ -381,6 +403,7 @@ def consolidate_updates(self, **kwargs):
381403
updates_uri=self.updates_array_uri,
382404
index_timestamp=max_timestamp,
383405
storage_version=self.storage_version,
406+
copy_centroids_uri=self.centroids_uri if should_pass_copy_centroids_uri else None,
384407
config=self.config,
385408
**kwargs,
386409
)

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ def ingest(
159159
if training_sample_size < -1:
160160
raise ValueError("training_sample_size should either be positive or -1 (to auto-configure based on the dataset sizes)")
161161

162+
if copy_centroids_uri is not None and training_sample_size != -1:
163+
raise ValueError("training_sample_size should not be provided alongside copy_centroids_uri")
164+
if copy_centroids_uri is not None and partitions == -1:
165+
raise ValueError("partitions should be provided if copy_centroids_uri is provided (set partitions to the number of centroids in copy_centroids_uri)")
166+
162167
if index_type != "IVF_FLAT" and training_sample_size != -1:
163168
raise ValueError("training_sample_size should only be provided with index_type IVF_FLAT")
164169
for variable in ["copy_centroids_uri", "training_input_vectors", "training_source_uri", "training_source_type"]:

apis/python/test/test_ingestion.py

Lines changed: 81 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ def test_ivf_flat_ingestion_with_updates(tmp_path):
365365
_, result = index.query(queries, k=k, nprobe=nprobe)
366366
assert accuracy(result, gt_i, updated_ids=updated_ids) == 1.0
367367

368-
index = index.consolidate_updates(partitions=20)
368+
index = index.consolidate_updates(retrain_index=True, partitions=20)
369369
_, result = index.query(queries, k=k, nprobe=20)
370370
assert accuracy(result, gt_i, updated_ids=updated_ids) == 1.0
371371

@@ -733,7 +733,7 @@ def test_storage_versions(tmp_path):
733733
_, result = index.query(queries, k=k)
734734
assert accuracy(result, gt_i, updated_ids=updated_ids) >= MINIMUM_ACCURACY
735735

736-
index = index.consolidate_updates(partitions=20)
736+
index = index.consolidate_updates(retrain_index=True, partitions=20)
737737
_, result = index.query(queries, k=k)
738738
assert accuracy(result, gt_i, updated_ids=updated_ids) >= MINIMUM_ACCURACY
739739

@@ -776,7 +776,8 @@ def test_copy_centroids_uri(tmp_path):
776776
index_type="IVF_FLAT",
777777
index_uri=index_uri,
778778
input_vectors=data,
779-
copy_centroids_uri=centroids_uri
779+
copy_centroids_uri=centroids_uri,
780+
partitions=centroids_in_size
780781
)
781782

782783
# Query the index.
@@ -931,8 +932,48 @@ def test_ingest_with_training_source_uri_tdb(tmp_path):
931932
queries = np.array([data.transpose()[1]], dtype=np.float32)
932933
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1]])
933934

935+
update_vectors = np.empty([3], dtype=object)
936+
update_vectors[0] = np.array([6.0, 6.1, 6.2, 6.3], dtype=np.dtype(np.float32))
937+
update_vectors[1] = np.array([7.0, 7.1, 7.2, 7.3], dtype=np.dtype(np.float32))
938+
update_vectors[2] = np.array([8.0, 8.1, 8.2, 8.3], dtype=np.dtype(np.float32))
939+
index.update_batch(vectors=update_vectors, external_ids=np.array([1000, 1001, 1002]))
940+
941+
index = index.consolidate_updates()
942+
943+
queries = np.array([update_vectors[2]], dtype=np.float32)
944+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1002]])
945+
946+
################################################################################################
947+
# Test we can load the index again and query, update, and consolidate.
948+
################################################################################################
949+
# Load the index again and query.
934950
index = IVFFlatIndex(uri=index_uri)
935-
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1]])
951+
952+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1002]])
953+
954+
# Update the index and query.
955+
update_vectors = np.empty([2], dtype=object)
956+
update_vectors[0] = np.array([9.0, 9.1, 9.2, 9.3], dtype=np.dtype(np.float32))
957+
update_vectors[1] = np.array([10.0, 10.1, 10.2, 10.3], dtype=np.dtype(np.float32))
958+
index.update_batch(vectors=update_vectors, external_ids=np.array([1003, 1004]))
959+
index = index.consolidate_updates()
960+
961+
queries = np.array([update_vectors[0]], dtype=np.float32)
962+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1003]])
963+
964+
# Clear the index history, load, update, and query.
965+
Index.clear_history(uri=index_uri, timestamp=index.latest_ingestion_timestamp - 1)
966+
967+
index = IVFFlatIndex(uri=index_uri)
968+
969+
update_vectors = np.empty([2], dtype=object)
970+
update_vectors[0] = np.array([11.0, 11.1, 11.2, 11.3], dtype=np.dtype(np.float32))
971+
update_vectors[1] = np.array([12.0, 12.1, 12.2, 12.3], dtype=np.dtype(np.float32))
972+
index.update_batch(vectors=update_vectors, external_ids=np.array([1003, 1004]))
973+
index = index.consolidate_updates()
974+
975+
queries = np.array([update_vectors[0]], dtype=np.float32)
976+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1003]])
936977

937978
###############################################################################################
938979
# Also test that we can ingest with training_source_type.
@@ -984,5 +1025,39 @@ def test_ingest_with_training_source_uri_numpy(tmp_path):
9841025
queries = np.array([data[1]], dtype=np.float32)
9851026
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1]])
9861027

987-
index = IVFFlatIndex(uri=index_uri)
988-
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1]])
1028+
update_vectors = np.empty([3], dtype=object)
1029+
update_vectors[0] = np.array([6.0, 6.1, 6.2, 6.3], dtype=np.dtype(np.float32))
1030+
update_vectors[1] = np.array([7.0, 7.1, 7.2, 7.3], dtype=np.dtype(np.float32))
1031+
update_vectors[2] = np.array([8.0, 8.1, 8.2, 8.3], dtype=np.dtype(np.float32))
1032+
index.update_batch(vectors=update_vectors, external_ids=np.array([1000, 1001, 1002]))
1033+
1034+
index = index.consolidate_updates()
1035+
1036+
queries = np.array([update_vectors[2]], dtype=np.float32)
1037+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1002]])
1038+
1039+
################################################################################################
1040+
# Test we can load the index again and query, update, and consolidate.
1041+
################################################################################################
1042+
index_ram = IVFFlatIndex(uri=index_uri)
1043+
1044+
queries = np.array([data[1]], dtype=np.float32)
1045+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1]])
1046+
1047+
update_vectors = np.empty([2], dtype=object)
1048+
update_vectors[0] = np.array([9.0, 9.1, 9.2, 9.3], dtype=np.dtype(np.float32))
1049+
update_vectors[1] = np.array([10.0, 10.1, 10.2, 10.3], dtype=np.dtype(np.float32))
1050+
index.update_batch(vectors=update_vectors, external_ids=np.array([1003, 1004]))
1051+
index_ram = index_ram.consolidate_updates()
1052+
1053+
queries = np.array([update_vectors[0]], dtype=np.float32)
1054+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1003]])
1055+
1056+
update_vectors = np.empty([2], dtype=object)
1057+
update_vectors[0] = np.array([11.0, 11.1, 11.2, 11.3], dtype=np.dtype(np.float32))
1058+
update_vectors[1] = np.array([12.0, 12.1, 12.2, 12.3], dtype=np.dtype(np.float32))
1059+
index.update_batch(vectors=update_vectors, external_ids=np.array([1003, 1004]))
1060+
index_ram = index_ram.consolidate_updates(retrain_index=True, training_sample_size=3)
1061+
1062+
queries = np.array([update_vectors[0]], dtype=np.float32)
1063+
query_and_check_equals(index=index, queries=queries, expected_result_d=[[0]], expected_result_i=[[1003]])

0 commit comments

Comments
 (0)