Type erased Vamana index (#285)

jparismorgan · web-flow · commit 6ff2a8667dcc · 2024-04-05T13:31:15.000+02:00
diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
diff --git a/apis/python/src/tiledb/vector_search/vamana_index.py b/apis/python/src/tiledb/vector_search/vamana_index.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Mapping
 
 import numpy as np
@@ -62,6 +63,8 @@ def query_internal(
         self,
         queries: np.ndarray,
         k: int = 10,
+        opt_l: Optional[int] = 1,
+        **kwargs,
     ):
         """
         Query an VAMANA index
@@ -72,7 +75,10 @@ def query_internal(
             ND Array of queries
         k: int
             Number of top results to return per query
+        opt_l: int
+            How deep to search
         """
+        warnings.warn("The Vamana index is not yet supported, please use with caution.")
         if self.size == 0:
             return np.full((queries.shape[0], k), index.MAX_FLOAT_32), np.full(
                 (queries.shape[0], k), index.MAX_UINT64
@@ -83,8 +89,10 @@ def query_internal(
         if queries.ndim == 1:
             queries = np.array([queries])
 
-        # TODO(paris): Actually run the query.
-        return [], []
+        queries_feature_vector_array = vspy.FeatureVectorArray(np.transpose(queries))
+        distances, ids = self.index.query(queries_feature_vector_array, k, opt_l)
+
+        return np.array(distances, copy=False), np.array(ids, copy=False)
 
 
 # TODO(paris): Pass more arguments to C++, i.e. storage_version.
@@ -94,24 +102,23 @@ def create(
     vector_type: np.dtype,
     id_type: np.dtype = np.uint32,
     adjacency_row_index_type: np.dtype = np.uint32,
-    group_exists: bool = False,
     config: Optional[Mapping[str, Any]] = None,
     storage_version: str = STORAGE_VERSION,
     **kwargs,
 ) -> VamanaIndex:
-    if not group_exists:
-        ctx = vspy.Ctx(config)
-        index = vspy.IndexVamana(
-            feature_type=np.dtype(vector_type).name,
-            id_type=np.dtype(id_type).name,
-            adjacency_row_index_type=np.dtype(adjacency_row_index_type).name,
-            dimension=dimensions,
-        )
-        # TODO(paris): Run all of this with a single C++ call.
-        empty_vector = vspy.FeatureVectorArray(
-            dimensions, 0, np.dtype(vector_type).name, np.dtype(id_type).name
-        )
-        index.train(empty_vector)
-        index.add(empty_vector)
-        index.write_index(ctx, uri)
+    warnings.warn("The Vamana index is not yet supported, please use with caution.")
+    ctx = vspy.Ctx(config)
+    index = vspy.IndexVamana(
+        feature_type=np.dtype(vector_type).name,
+        id_type=np.dtype(id_type).name,
+        adjacency_row_index_type=np.dtype(adjacency_row_index_type).name,
+        dimension=dimensions,
+    )
+    # TODO(paris): Run all of this with a single C++ call.
+    empty_vector = vspy.FeatureVectorArray(
+        dimensions, 0, np.dtype(vector_type).name, np.dtype(id_type).name
+    )
+    index.train(empty_vector)
+    index.add(empty_vector)
+    index.write_index(ctx, uri)
     return VamanaIndex(uri=uri, config=config, memory_budget=1000000)
diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py
@@ -19,6 +19,15 @@
 from tiledb.vector_search.vamana_index import VamanaIndex
 
 
+def query_and_check_distances(
+    index, queries, k, expected_distances, expected_ids, **kwargs
+):
+    for _ in range(1):
+        distances, ids = index.query(queries, k=k, **kwargs)
+        assert np.array_equal(ids, expected_ids)
+        assert np.array_equal(distances, expected_distances)
+
+
 def query_and_check(index, queries, k, expected, **kwargs):
     for _ in range(3):
         result_d, result_i = index.query(queries, k=k, **kwargs)
@@ -167,7 +176,7 @@ def test_ivf_flat_index(tmp_path):
     )
 
 
-def test_vamana_index(tmp_path):
+def test_vamana_index_simple(tmp_path):
     uri = os.path.join(tmp_path, "array")
     dimensions = 3
     vector_type = np.dtype(np.uint8)
@@ -188,14 +197,68 @@ def test_vamana_index(tmp_path):
     query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {ind.MAX_UINT64})
 
 
+def test_vamana_index(tmp_path):
+    uri = os.path.join(tmp_path, "array")
+    if os.path.exists(uri):
+        os.rmdir(uri)
+    vector_type = np.float32
+
+    index = vamana_index.create(
+        uri=uri,
+        dimensions=3,
+        vector_type=np.dtype(vector_type),
+        id_type=np.dtype(np.uint32),
+    )
+
+    queries = np.array([[2, 2, 2]], dtype=np.float32)
+    distances, ids = index.query(queries, k=1)
+    assert distances.shape == (1, 1)
+    assert ids.shape == (1, 1)
+    assert distances[0][0] == ind.MAX_FLOAT_32
+    assert ids[0][0] == ind.MAX_UINT64
+    query_and_check_distances(
+        index, queries, 1, [[ind.MAX_FLOAT_32]], [[ind.MAX_UINT64]]
+    )
+
+    update_vectors = np.empty([5], dtype=object)
+    update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.float32))
+    update_vectors[1] = np.array([1, 1, 1], dtype=np.dtype(np.float32))
+    update_vectors[2] = np.array([2, 2, 2], dtype=np.dtype(np.float32))
+    update_vectors[3] = np.array([3, 3, 3], dtype=np.dtype(np.float32))
+    update_vectors[4] = np.array([4, 4, 4], dtype=np.dtype(np.float32))
+    index.update_batch(
+        vectors=update_vectors,
+        external_ids=np.array([0, 1, 2, 3, 4], dtype=np.dtype(np.uint32)),
+    )
+    query_and_check_distances(
+        index, np.array([[2, 2, 2]], dtype=np.float32), 2, [[0, 3]], [[2, 1]]
+    )
+
+    index = index.consolidate_updates()
+
+    # TODO(paris): Does not work with k > 1 or with [0, 0, 0] as the query.
+    query_and_check_distances(
+        index, np.array([[1, 1, 1]], dtype=np.float32), 1, [[0]], [[1]]
+    )
+    query_and_check_distances(
+        index, np.array([[2, 2, 2]], dtype=np.float32), 1, [[0]], [[2]]
+    )
+    query_and_check_distances(
+        index, np.array([[3, 3, 3]], dtype=np.float32), 1, [[0]], [[3]]
+    )
+    query_and_check_distances(
+        index, np.array([[4, 4, 4]], dtype=np.float32), 1, [[0]], [[4]]
+    )
+
+
 def test_delete_invalid_index(tmp_path):
     # We don't throw with an invalid uri.
     Index.delete_index(uri="invalid_uri", config=tiledb.cloud.Config())
 
 
 def test_delete_index(tmp_path):
-    indexes = ["FLAT", "IVF_FLAT"]
-    index_classes = [FlatIndex, IVFFlatIndex]
+    indexes = ["FLAT", "IVF_FLAT", "VAMANA"]
+    index_classes = [FlatIndex, IVFFlatIndex, VamanaIndex]
     data = np.array([[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]], dtype=np.float32)
     for index_type, index_class in zip(indexes, index_classes):
         index_uri = os.path.join(tmp_path, f"array_{index_type}")
@@ -229,7 +292,7 @@ def test_index_with_incorrect_dimensions(tmp_path):
 def test_index_with_incorrect_num_of_query_columns_simple(tmp_path):
     siftsmall_uri = siftsmall_inputs_file
     queries_uri = siftsmall_query_file
-    indexes = ["FLAT", "IVF_FLAT"]
+    indexes = ["FLAT", "IVF_FLAT", "VAMANA"]
     for index_type in indexes:
         index_uri = os.path.join(tmp_path, f"sift10k_flat_{index_type}")
         index = ingest(
@@ -253,7 +316,7 @@ def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
     # Tests that we raise a TypeError if the number of columns in the query is not the same as the
     # number of columns in the indexed data.
     size = 1000
-    indexes = ["FLAT", "IVF_FLAT"]
+    indexes = ["FLAT", "IVF_FLAT", "VAMANA"]
     num_columns_in_vector = [1, 2, 3, 4, 5, 10]
     for index_type in indexes:
         for num_columns in num_columns_in_vector:
@@ -298,7 +361,7 @@ def test_index_with_incorrect_num_of_query_columns_in_single_vector_query(tmp_pa
     # Tests that we raise a TypeError if the number of columns in the query is not the same as the
     # number of columns in the indexed data, specifically for a single vector query.
     # i.e. queries = [1, 2, 3]  instead of queries = [[1, 2, 3], [4, 5, 6]].
-    indexes = [flat_index, ivf_flat_index]
+    indexes = [flat_index, ivf_flat_index, vamana_index]
     for index_type in indexes:
         uri = os.path.join(tmp_path, f"array_{index_type.__name__}")
         index = index_type.create(uri=uri, dimensions=3, vector_type=np.dtype(np.uint8))
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -15,6 +15,7 @@
 from tiledb.vector_search.module import kmeans_fit
 from tiledb.vector_search.module import kmeans_predict
 from tiledb.vector_search.utils import load_fvecs
+from tiledb.vector_search.vamana_index import VamanaIndex
 
 MINIMUM_ACCURACY = 0.85
 MAX_UINT64 = np.iinfo(np.dtype("uint64")).max
@@ -30,6 +31,34 @@ def query_and_check_equals(index, queries, expected_result_d, expected_result_i)
     )
 
 
+def test_vamana_ingestion_u8(tmp_path):
+    dataset_dir = os.path.join(tmp_path, "dataset")
+    index_uri = os.path.join(tmp_path, "array")
+    if os.path.exists(index_uri):
+        shutil.rmtree(index_uri)
+    create_random_dataset_u8(nb=10000, d=100, nq=100, k=10, path=dataset_dir)
+    dtype = np.dtype(np.uint8)
+    k = 10
+
+    queries = get_queries(dataset_dir, dtype=dtype)
+    gt_i, gt_d = get_groundtruth(dataset_dir, k)
+
+    index = ingest(
+        index_type="VAMANA",
+        index_uri=index_uri,
+        source_uri=os.path.join(dataset_dir, "data.u8bin"),
+    )
+    _, result = index.query(queries, k=k)
+    # TODO(paris): Fix IDs and re-enable.
+    # assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+    index_uri = move_local_index_to_new_location(index_uri)
+    index_ram = VamanaIndex(uri=index_uri)
+    _, result = index_ram.query(queries, k=k)
+    # TODO(paris): Fix IDs and re-enable.
+    # assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+
 def test_flat_ingestion_u8(tmp_path):
     dataset_dir = os.path.join(tmp_path, "dataset")
     index_uri = os.path.join(tmp_path, "array")
diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py
@@ -5,7 +5,6 @@
 
 from tiledb.vector_search import _tiledbvspy as vspy
 
-# ctx = tiledb.Ctx()
 ctx = vspy.Ctx({})
 
 
@@ -187,6 +186,45 @@ def test_construct_IndexVamana():
     assert a.dimension() == 0
 
 
+def test_construct_IndexVamana_with_empty_vector(tmp_path):
+    opt_l = 100
+    k_nn = 10
+    index_uri = os.path.join(tmp_path, "array")
+    dimensions = 128
+    feature_type = "float32"
+    id_type = "uint64"
+    adjacency_row_index_type = "uint64"
+
+    # First create an empty index.
+    a = vspy.IndexVamana(
+        feature_type=feature_type,
+        id_type=id_type,
+        adjacency_row_index_type=adjacency_row_index_type,
+        dimension=dimensions,
+    )
+    empty_vector = vspy.FeatureVectorArray(dimensions, 0, feature_type, id_type)
+    a.train(empty_vector)
+    a.write_index(ctx, index_uri)
+
+    # Then load it again, retrain, and query.
+    a = vspy.IndexVamana(ctx, index_uri)
+    training_set = vspy.FeatureVectorArray(ctx, siftsmall_inputs_uri)
+    assert training_set.feature_type_string() == "float32"
+    query_set = vspy.FeatureVectorArray(ctx, siftsmall_query_uri)
+    assert query_set.feature_type_string() == "float32"
+    groundtruth_set = vspy.FeatureVectorArray(ctx, siftsmall_groundtruth_uri)
+    assert groundtruth_set.feature_type_string() == "uint64"
+
+    a.train(training_set)
+
+    s, t = a.query(query_set, k_nn, opt_l)
+
+    intersections = vspy.count_intersections(t, groundtruth_set, k_nn)
+    nt = np.double(t.num_vectors()) * np.double(k_nn)
+    recall = intersections / nt
+    assert recall == 1.0
+
+
 def test_inplace_build_query_IndexVamana():
     opt_l = 100
     k_nn = 10
diff --git a/src/include/detail/linalg/tdb_io.h b/src/include/detail/linalg/tdb_io.h
@@ -94,7 +94,7 @@ std::vector<T> read_vector_helper(
 
   // Create a subarray that reads the array up to the specified subset.
   std::vector<int32_t> subarray_vals = {
-      (int32_t)start_pos, (int32_t)end_pos - 1};
+      (int32_t)start_pos, std::max(0, (int32_t)end_pos - 1)};
   tiledb::Subarray subarray(ctx, *array_);
   subarray.set_subarray(subarray_vals);
 
@@ -136,9 +136,9 @@ void create_empty_for_matrix(
   tiledb::Domain domain(ctx);
   domain
       .add_dimension(tiledb::Dimension::create<int>(
-          ctx, "rows", {{0, (int)rows - 1}}, row_extent))
+          ctx, "rows", {{0, std::max(0, (int)rows - 1)}}, row_extent))
       .add_dimension(tiledb::Dimension::create<int>(
-          ctx, "cols", {{0, (int)cols - 1}}, col_extent));
+          ctx, "cols", {{0, std::max(0, (int)cols - 1)}}, col_extent));
 
   tiledb::ArraySchema schema(ctx, TILEDB_DENSE);
 
@@ -218,10 +218,9 @@ void write_matrix(
 
   std::vector<int32_t> subarray_vals{
       0,
-      (int)A.num_rows() - 1,
-      (int)start_pos,
-      (int)start_pos + (int)A.num_cols() - 1};
-
+      std::max(0, (int)A.num_rows() - 1),
+      std::max(0, (int)start_pos),
+      std::max(0, (int)start_pos + (int)A.num_cols() - 1)};
   // Open array for writing
   auto array = tiledb_helpers::open_array(
       tdb_func__, ctx, uri, TILEDB_WRITE, temporal_policy);
@@ -265,7 +264,7 @@ void create_empty_for_vector(
     std::optional<tiledb_filter_type_t> filter = std::nullopt) {
   tiledb::Domain domain(ctx);
   domain.add_dimension(tiledb::Dimension::create<int>(
-      ctx, "rows", {{0, (int)rows - 1}}, row_extent));
+      ctx, "rows", {{0, std::max(0, (int)rows - 1)}}, row_extent));
 
   // The array will be dense.
   tiledb::ArraySchema schema(ctx, TILEDB_DENSE);
diff --git a/src/include/index/ivf_flat_group.h b/src/include/index/ivf_flat_group.h
@@ -45,8 +45,6 @@
      {
          {"centroids_array_name", "partition_centroids"},
          {"index_array_name", "partition_indexes"},
-         {"ids_array_name", "shuffled_vector_ids"},
-         {"parts_array_name", "shuffled_vectors"},
      }}};
 
 template <class Index>
diff --git a/src/include/index/vamana_group.h b/src/include/index/vamana_group.h
@@ -37,13 +37,26 @@
 #include "index/vamana_metadata.h"
 
 /**
- * The vamana index group needs to store
- *   * vectors
- *   * graph (basically CSR)
- *     * neighbor lists
- *     * neighbor scores (distances)
- *     * "row" index
- *   * centroids (for the case of partitioned vamana)
+ * The vamana index group stores:
+ * - feature_vectors: the original set of vectors which we copy.
+ *   - Example: [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
+ * - feature_vectors_ids: the IDs of the vectors in feature_vectors_array_name.
+ *   - Example: [99, 100, 101]
+ * - The graph (basically a CSR)
+ *   - adjacency_ids: These are indexes into feature_vectors. Vertices go from 0
+ * -> n-1 and each of those vertices indexes into feature_vectors. Then those
+ * IDs correspond to the indexes. You can also think of it as holding the R
+ * nearest neighbors in the graph for each vertex.
+ *      - Example: Here we have 100 and 101 connected, 99 and 101 connected, and
+ * 99 and 100 connected. Logically you can think of it like: [[1, 2], [0, 2],
+ * [0, 1]], but it's stored as [1, 2, 0, 2, 0, 1]
+ *   - adjacency_scores: This holds the neighbor scores (i.e. the distances)
+ *      - Example: [[distance between 0 and 1, distance between 0 and 2], etc.]
+ *   -  adjacency_row_index: Each entry in the row index indicates where the
+ * neighbors for that index start. 0 because that's where neighbors for vertex
+ * 0 start, then 2 b/c that's where neighbors for vertex 1 start, then 4 b/c
+ * that's where neighbors for vertex 2 start, then 6 b/c that's the end.
+ *      - Example: [0, 2, 4, 6]
  */
 [[maybe_unused]] static StorageFormat vamana_storage_formats = {
     {"0.3",
diff --git a/src/include/test/unit_api_feature_vector_array.cc b/src/include/test/unit_api_feature_vector_array.cc
diff --git a/src/include/test/unit_api_vamana_index.cc b/src/include/test/unit_api_vamana_index.cc

Original file line number	Diff line number	Diff line change
`@@ -45,8 +45,6 @@`
`45`	`45`	`{`
`46`	`46`	`{"centroids_array_name", "partition_centroids"},`
`47`	`47`	`{"index_array_name", "partition_indexes"},`
`48`		`- {"ids_array_name", "shuffled_vector_ids"},`
`49`		`- {"parts_array_name", "shuffled_vectors"},`
`50`	`48`	`}}};`
`51`	`49`
`52`	`50`	`template <class Index>`