Commit 62ff91a

Allow setting IVF PQ partitions when re-ingesting, fix IVF PQ object index tests (#453)
1 parent fac90e4 commit 62ff91a

8 files changed, +121 -46 lines changed

apis/python/src/tiledb/vector_search/index.py

Lines changed: 3 additions & 0 deletions
@@ -450,6 +450,9 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
         are added to the index. It triggers a base index re-indexing, merging the non-consolidated
         updates and the rest of the base vectors.
 
+        TODO(sc-51202): This throws with a unintuitive error message if update()/delete()/etc. has
+        not been called.
+
         Parameters
         ----------
         retrain_index: bool
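
A minimal sketch of the call order this new docstring note assumes: at least one update()/delete() is applied before consolidate_updates(). The URI, the example vectors, and the exact ingest()/update()/delete() keyword names are illustrative assumptions, not taken from this diff.

import numpy as np
from tiledb.vector_search import ingestion

index_uri = "/tmp/example_ivf_pq_index"  # hypothetical URI
vectors = np.random.rand(1000, 4).astype(np.float32)

# Initial ingestion; parameters other than index_type/index_uri are assumed.
index = ingestion.ingest(index_type="IVF_PQ", index_uri=index_uri, input_vectors=vectors)

# Apply at least one update before consolidating; per the note above (sc-51202),
# calling consolidate_updates() with nothing to consolidate raises an unintuitive error.
index.update(vector=vectors[0], external_id=0)
index.delete(external_id=1)
index = index.consolidate_updates(retrain_index=True)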

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 7 additions & 4 deletions
@@ -1557,6 +1557,7 @@ def ingest_type_erased(
     dimensions: int,
     size: int,
     batch: int,
+    partitions: int,
     config: Optional[Mapping[str, Any]] = None,
     verbose: bool = False,
     trace_id: Optional[str] = None,
@@ -1671,16 +1672,17 @@ def ingest_type_erased(
     from tiledb.vector_search import _tiledbvspy as vspy
 
     ctx = vspy.Ctx(config)
+    data = vspy.FeatureVectorArray(
+        ctx, parts_array_uri, ids_array_uri, 0, to_temporal_policy(index_timestamp)
+    )
     if index_type == "VAMANA":
         index = vspy.IndexVamana(ctx, index_group_uri)
+        index.train(data)
     elif index_type == "IVF_PQ":
         index = vspy.IndexIVFPQ(ctx, index_group_uri)
+        index.train(data, partitions)
     else:
         raise ValueError(f"Unsupported index type: {index_type}")
-    data = vspy.FeatureVectorArray(
-        ctx, parts_array_uri, ids_array_uri, 0, to_temporal_policy(index_timestamp)
-    )
-    index.train(data)
     index.add(data)
     index.write_index(ctx, index_group_uri, to_temporal_policy(index_timestamp))
 
@@ -2270,6 +2272,7 @@ def scale_resources(min_resource, max_resource, max_input_size, input_size):
         dimensions=dimensions,
         size=size,
         batch=input_vectors_batch_size,
+        partitions=partitions,
         config=config,
         verbose=verbose,
         trace_id=trace_id,
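
With this change the ingestion path forwards a partitions value into ingest_type_erased(), and the IVF_PQ branch passes it to train() instead of the previous unconditional index.train(data). A hedged sketch of how a caller might set the partition count at ingestion time; the URI and the exact set of ingest() keywords beyond those visible in this commit are assumptions.

import numpy as np
from tiledb.vector_search import ingestion

vectors = np.random.rand(10_000, 4).astype(np.float32)
index = ingestion.ingest(
    index_type="IVF_PQ",
    index_uri="/tmp/ivf_pq_index",  # hypothetical URI
    input_vectors=vectors,
    partitions=16,  # forwarded to ingest_type_erased() and then to IndexIVFPQ.train()
)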

apis/python/src/tiledb/vector_search/object_api/object_index.py

Lines changed: 3 additions & 3 deletions
@@ -125,9 +125,9 @@ def query(
         self,
         query_objects: np.ndarray,
         k: int,
-        query_metadata: OrderedDict = None,
-        metadata_array_cond: str = None,
-        metadata_df_filter_fn: str = None,
+        query_metadata: Optional[OrderedDict] = None,
+        metadata_array_cond: Optional[str] = None,
+        metadata_df_filter_fn: Optional[str] = None,
         return_objects: bool = True,
         return_metadata: bool = True,
         **kwargs,
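
The change here replaces implicit-Optional defaults with explicit Optional[...] annotations, which is what PEP 484 and strict type checkers expect when a parameter defaults to None. A tiny illustration with hypothetical function names:

from typing import Optional

def query_old(metadata_array_cond: str = None): ...            # implicit Optional; flagged by strict mypy
def query_new(metadata_array_cond: Optional[str] = None): ...  # explicit, matching the change above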

apis/python/src/tiledb/vector_search/type_erased_module.cc

Lines changed: 5 additions & 4 deletions
@@ -480,10 +480,11 @@ void init_type_erased_module(py::module_& m) {
           })
       .def(
           "train",
-          [](IndexIVFPQ& index, const FeatureVectorArray& vectors) {
-            index.train(vectors);
-          },
-          py::arg("vectors"))
+          [](IndexIVFPQ& index,
+             const FeatureVectorArray& vectors,
+             std::optional<size_t> nlist) { index.train(vectors, nlist); },
+          py::arg("vectors"),
+          py::arg("nlist") = std::nullopt)
       .def(
           "add",
           [](IndexIVFPQ& index, const FeatureVectorArray& vectors) {
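
Because nlist defaults to std::nullopt, the Python-visible IndexIVFPQ.train() keeps its old one-argument form while also accepting the partition count that ingest_type_erased() now passes. A hedged sketch of the two call shapes; the group and array URIs are placeholders, and the FeatureVectorArray constructor arguments simply mirror the diff above (the trailing 0/None arguments are assumptions).

from tiledb.vector_search import _tiledbvspy as vspy

ctx = vspy.Ctx({})
index = vspy.IndexIVFPQ(ctx, "/tmp/ivf_pq_group")  # hypothetical group URI
data = vspy.FeatureVectorArray(ctx, "/tmp/parts_array", "/tmp/ids_array", 0, None)  # hypothetical URIs

index.train(data)             # nlist omitted: keep the partition count stored in the index
index.train(data, nlist=128)  # override it, as the IVF_PQ branch of ingest_type_erased() now does
index.add(data)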

apis/python/test/test_object_index.py

Lines changed: 92 additions & 30 deletions
@@ -144,25 +144,64 @@ def read_objects_by_external_ids(self, ids: List[int]) -> OrderedDict:
         return {"object": objects, "external_id": external_ids}
 
 
-def evaluate_query(index_uri, query_kwargs, dim_id, vector_dim_offset, config=None):
+def assert_equal(
+    index_type: str,
+    ids: np.array,
+    expected_ids: np.array,
+    ivf_pq_accuracy_threshold: float,
+):
+    """
+    IVF_PQ index has a lower recall rate than other indexes b/c of PQ-encoding, so we need to lower
+    the threshold.
+
+    Parameters
+    ----------
+    index_type: str
+        The index type.
+    ids: np.array
+        The ids returned by the query.
+    expected_ids: np.array
+        The expected ids.
+    ivf_pq_accuracy_threshold: float
+        The minimum fraction of expected_ids that must be in ids.
+    """
+    assert len(ids) == len(expected_ids)
+    if index_type == "IVF_PQ":
+        matches = np.intersect1d(ids, expected_ids)
+        assert len(matches) / len(ids) >= ivf_pq_accuracy_threshold
+        return
+
+    assert np.array_equiv(ids, expected_ids)
+
+
+def evaluate_query(
+    index_type: str, index_uri, query_kwargs, dim_id, vector_dim_offset, config=None
+):
     v_id = dim_id - vector_dim_offset
+
     index = object_index.ObjectIndex(uri=index_uri, config=config)
     distances, objects, metadata = index.query(
-        {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=5, **query_kwargs
+        {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=21, **query_kwargs
     )
-    assert np.array_equiv(
+    assert_equal(
+        index_type,
         np.unique(objects["external_id"]),
-        np.array([v_id - 2, v_id - 1, v_id, v_id + 1, v_id + 2]),
+        np.array([v_id + i for i in range(-10, 11)]),
+        ivf_pq_accuracy_threshold=0.8,
     )
+
     distances, object_ids = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
-        k=5,
+        k=21,
         return_objects=False,
         return_metadata=False,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        np.unique(object_ids), np.array([v_id - 2, v_id - 1, v_id, v_id + 1, v_id + 2])
+    assert_equal(
+        index_type,
+        np.unique(object_ids),
+        np.array([v_id + i for i in range(-10, 11)]),
+        ivf_pq_accuracy_threshold=0.8,
     )
 
     def df_filter(row):
@@ -171,66 +210,84 @@ def df_filter(row):
     distances, objects, metadata = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
         metadata_df_filter_fn=df_filter,
-        k=5,
+        k=21,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        objects["external_id"], np.array([v_id, v_id + 1, v_id + 2, v_id + 3, v_id + 4])
+    assert_equal(
+        index_type,
+        np.unique(objects["external_id"]),
+        np.array([v_id + i for i in range(0, 21)]),
+        ivf_pq_accuracy_threshold=0.8,
     )
 
     distances, object_ids = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
         metadata_df_filter_fn=df_filter,
-        k=5,
+        k=21,
         return_objects=False,
         return_metadata=False,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        object_ids, np.array([v_id, v_id + 1, v_id + 2, v_id + 3, v_id + 4])
+    assert_equal(
+        index_type,
+        np.unique(object_ids),
+        np.array([v_id + i for i in range(0, 21)]),
+        ivf_pq_accuracy_threshold=0.8,
    )
 
     index = object_index.ObjectIndex(
         uri=index_uri, load_metadata_in_memory=False, config=config
     )
     distances, objects, metadata = index.query(
-        {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=5, **query_kwargs
+        {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])}, k=21, **query_kwargs
     )
-    assert np.array_equiv(
+    assert_equal(
+        index_type,
         np.unique(objects["external_id"]),
-        np.array([v_id - 2, v_id - 1, v_id, v_id + 1, v_id + 2]),
+        np.array([v_id + i for i in range(-10, 11)]),
+        ivf_pq_accuracy_threshold=0.8,
     )
+
     distances, object_ids = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
-        k=5,
+        k=21,
         return_objects=False,
         return_metadata=False,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        np.unique(object_ids), np.array([v_id - 2, v_id - 1, v_id, v_id + 1, v_id + 2])
+    assert_equal(
+        index_type,
+        np.unique(object_ids),
+        np.array([v_id + i for i in range(-10, 11)]),
+        ivf_pq_accuracy_threshold=0.8,
    )
 
     distances, objects, metadata = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
         metadata_array_cond=f"test_attr >= {dim_id}",
-        k=5,
+        k=21,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        objects["external_id"], np.array([v_id, v_id + 1, v_id + 2, v_id + 3, v_id + 4])
+    assert_equal(
+        index_type,
+        np.unique(objects["external_id"]),
+        np.array([v_id + i for i in range(0, 21)]),
+        ivf_pq_accuracy_threshold=0.8,
    )
 
     distances, object_ids = index.query(
         {"object": np.array([[dim_id, dim_id, dim_id, dim_id]])},
         metadata_array_cond=f"test_attr >= {dim_id}",
-        k=5,
+        k=21,
         return_objects=False,
         return_metadata=False,
         **query_kwargs,
     )
-    assert np.array_equiv(
-        object_ids, np.array([v_id, v_id + 1, v_id + 2, v_id + 3, v_id + 4])
+    assert_equal(
+        index_type,
+        np.unique(object_ids),
+        np.array([v_id + i for i in range(0, 21)]),
+        ivf_pq_accuracy_threshold=0.8,
    )
 
 
@@ -256,12 +313,8 @@ def test_object_index(tmp_path):
 
         # Check initial ingestion
         index.update_index(partitions=10)
-
-        # TODO(SC-48908): Fix IVF_PQ with object index queries and remove.
-        if index_type == "IVF_PQ":
-            continue
-
         evaluate_query(
+            index_type=index_type,
             index_uri=index_uri,
             query_kwargs={"nprobe": 10, "l_search": 250},
             dim_id=42,
@@ -272,6 +325,7 @@ def test_object_index(tmp_path):
         index = object_index.ObjectIndex(uri=index_uri)
         index.update_index(partitions=10)
         evaluate_query(
+            index_type=index_type,
             index_uri=index_uri,
             query_kwargs={"nprobe": 10, "l_search": 500},
             dim_id=42,
@@ -288,6 +342,7 @@ def test_object_index(tmp_path):
         index.update_object_reader(reader)
         index.update_index(partitions=10)
         evaluate_query(
+            index_type=index_type,
            index_uri=index_uri,
            query_kwargs={"nprobe": 10, "l_search": 500},
            dim_id=1042,
@@ -304,6 +359,7 @@ def test_object_index(tmp_path):
         index.update_object_reader(reader)
         index.update_index(partitions=10)
         evaluate_query(
+            index_type=index_type,
            index_uri=index_uri,
            query_kwargs={"nprobe": 10, "l_search": 500},
            dim_id=2042,
@@ -351,6 +407,7 @@ def test_object_index_ivf_flat_cloud(tmp_path):
         config=config,
     )
     evaluate_query(
+        index_type="IVF_FLAT",
         index_uri=index_uri,
         query_kwargs={"nprobe": 10},
         dim_id=42,
@@ -381,6 +438,7 @@ def test_object_index_ivf_flat_cloud(tmp_path):
         config=config,
     )
     evaluate_query(
+        index_type="IVF_FLAT",
         index_uri=index_uri,
         query_kwargs={"nprobe": 10},
         dim_id=1042,
@@ -409,6 +467,7 @@ def test_object_index_flat(tmp_path):
     # Check initial ingestion
     index.update_index()
     evaluate_query(
+        index_type="FLAT",
         index_uri=index_uri,
         query_kwargs={},
         dim_id=42,
@@ -419,6 +478,7 @@ def test_object_index_flat(tmp_path):
     index = object_index.ObjectIndex(uri=index_uri)
     index.update_index()
     evaluate_query(
+        index_type="FLAT",
         index_uri=index_uri,
         query_kwargs={},
         dim_id=42,
@@ -435,6 +495,7 @@ def test_object_index_flat(tmp_path):
     index.update_object_reader(reader)
     index.update_index()
     evaluate_query(
+        index_type="FLAT",
         index_uri=index_uri,
         query_kwargs={},
         dim_id=1042,
@@ -451,6 +512,7 @@ def test_object_index_flat(tmp_path):
     index.update_object_reader(reader)
     index.update_index()
     evaluate_query(
+        index_type="FLAT",
         index_uri=index_uri,
         query_kwargs={},
         dim_id=2042,
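
assert_equal() relaxes the exact-match assertions into a recall check for IVF_PQ, since product quantization is lossy and the top-k neighbors are only approximately recovered. A self-contained numpy sketch of the same intersection-based check (the example values are made up):

import numpy as np

def recall(ids: np.ndarray, expected_ids: np.ndarray) -> float:
    # Order-insensitive fraction of expected ids actually returned, the same
    # np.intersect1d-based measure assert_equal applies when index_type == "IVF_PQ".
    return len(np.intersect1d(ids, expected_ids)) / len(expected_ids)

expected = np.arange(32, 53)                                    # v_id - 10 .. v_id + 10 for v_id = 42
returned = np.concatenate([np.arange(32, 50), [99, 100, 101]])  # 18 of the 21 expected ids
assert len(returned) == len(expected)
assert recall(returned, expected) >= 0.8                        # 18 / 21 is about 0.86, above the 0.8 threshold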

src/include/api/ivf_pq_index.h

Lines changed: 10 additions & 3 deletions
@@ -164,11 +164,14 @@ class IndexIVFPQ {
 
   /**
    * @brief Train the index based on the given training set.
-   * @param training_set
-   * @param init
+   * @param training_set The training input vectors.
+   * @param n_list The number of clusters to use in the index. Can be passed to
+   * override the value we used when we first created the index.
    */
   // @todo -- infer feature type from input
-  void train(const FeatureVectorArray& training_set) {
+  void train(
+      const FeatureVectorArray& training_set,
+      std::optional<size_t> n_list = std::nullopt) {
     if (feature_datatype_ == TILEDB_ANY) {
       feature_datatype_ = training_set.feature_type();
     } else if (feature_datatype_ != training_set.feature_type()) {
@@ -184,6 +187,10 @@ class IndexIVFPQ {
       throw std::runtime_error("Unsupported datatype combination");
     }
 
+    if (n_list.has_value()) {
+      n_list_ = *n_list;
+    }
+
     // Create a new index. Note that we may have already loaded an existing
     // index by URI. In that case, we have updated our local state (i.e.
     // num_subspaces_, etc.), but we should also use the timestamp from that
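
The new optional n_list parameter means a retrain can override the partition count the index was created with, which is what lets partitions take effect on re-ingestion (the tests above drive it through update_index(partitions=10)). A toy, self-contained Python sketch of just the override semantics, not the library API:

from typing import Optional

class PartitionedIndexSketch:
    """Toy model of IndexIVFPQ::train()'s n_list handling: keep the stored
    partition count unless train() is explicitly given a new one."""

    def __init__(self, partitions: int):
        self.partitions = partitions

    def train(self, n_list: Optional[int] = None) -> None:
        if n_list is not None:
            self.partitions = n_list  # mirrors `if (n_list.has_value()) n_list_ = *n_list;`

idx = PartitionedIndexSketch(partitions=4)
idx.train()           # partition count stays 4
idx.train(n_list=16)  # re-ingestion with an override
assert idx.partitions == 16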

src/include/api/vamana_index.h

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@
  *
  * We support all combinations of the following types for feature, id, and px
  * datatypes:
- * - feature_type: uint8 or float
+ * - feature_type: uint8, int8, or float
  * - id_type: uint32 or uint64
  * - adjacency_row_index_type: uint32 or uint64
  */
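
The comment now lists int8 among the supported Vamana feature types. A hedged sketch of ingesting int8 vectors; whether the Python ingest() path accepts int8 input, and its exact keywords, are assumptions not established by this diff.

import numpy as np
from tiledb.vector_search import ingestion

vectors = np.random.randint(-128, 128, size=(1000, 4), dtype=np.int8)
index = ingestion.ingest(
    index_type="VAMANA",
    index_uri="/tmp/vamana_int8_index",  # hypothetical URI
    input_vectors=vectors,
)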

src/include/index/ivf_pq_index.h

Lines changed: 0 additions & 1 deletion
@@ -713,7 +713,6 @@ class ivf_pq_index {
    * @param training_set_ids IDs for each vector.
    *
    * @todo Create and write index that is larger than RAM
-   * @todo Use training_set_ids as the external IDs.
    */
   template <
       feature_vector_array Array,
