Expose Vamana graph building params (#423)

jparismorgan · web-flow · commit 6d908cea6695 · 2024-07-01T14:35:20.000+02:00
diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
@@ -52,6 +52,8 @@ def ingest(
     size: int = -1,
     partitions: int = -1,
     num_subspaces: int = -1,
+    l_build: int = -1,
+    r_max_degree: int = -1,
     training_sampling_policy: TrainingSamplingPolicy = TrainingSamplingPolicy.FIRST_N,
     copy_centroids_uri: str = None,
     training_sample_size: int = -1,
@@ -120,6 +122,12 @@ def ingest(
         For PQ encoded indexes, the number of subspaces to use in the PQ encoding. We will divide the dimensions into
         num_subspaces parts, and PQ encode each part separately. This means dimensions must
         be divisible by num_subspaces.
+    l_build: int
+        For Vamana indexes, the number of neighbors considered for each node during construction of the graph. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. l_build should be >= r_max_degree unless you need to build indices quickly and can compromise on quality.
+        Typically between 75 and 200. If not provided, use the default value of 100.
+    r_max_degree: int
+        For Vamana indexes, the maximum degree for each node in the final graph. Larger values will result in larger indices and longer indexing times, but better search quality.
+        Typically between 60 and 150. If not provided, use the default value of 64.
     copy_centroids_uri: str
         TileDB array URI to copy centroids from, if not provided, centroids are build running `k-means`.
     training_sample_size: int
@@ -2671,6 +2679,8 @@ def consolidate_and_vacuum(
                             dimensions=dimensions,
                             vector_type=vector_type,
                             config=config,
+                            l_build=l_build,
+                            r_max_degree=r_max_degree,
                             storage_version=storage_version,
                         )
                     elif index_type == "IVF_PQ":
diff --git a/apis/python/src/tiledb/vector_search/type_erased_module.cc b/apis/python/src/tiledb/vector_search/type_erased_module.cc
@@ -365,6 +365,8 @@ void init_type_erased_module(py::module_& m) {
       .def("feature_type_string", &IndexVamana::feature_type_string)
       .def("id_type_string", &IndexVamana::id_type_string)
       .def("dimensions", &IndexVamana::dimensions)
+      .def("l_build", &IndexVamana::l_build)
+      .def("r_max_degree", &IndexVamana::r_max_degree)
       .def_static(
           "clear_history",
           [](const tiledb::Context& ctx,
diff --git a/apis/python/src/tiledb/vector_search/vamana_index.py b/apis/python/src/tiledb/vector_search/vamana_index.py
@@ -27,6 +27,10 @@
 
 INDEX_TYPE = "VAMANA"
 
+L_BUILD_DEFAULT = 100
+R_MAX_DEGREE_DEFAULT = 64
+L_SEARCH_DEFAULT = 100
+
 
 class VamanaIndex(index.Index):
     """
@@ -97,7 +101,7 @@ def query_internal(
         self,
         queries: np.ndarray,
         k: int = 10,
-        l_search: Optional[int] = 100,
+        l_search: Optional[int] = L_SEARCH_DEFAULT,
         **kwargs,
     ):
         """
@@ -110,7 +114,8 @@ def query_internal(
         k: int
             Number of results to return per query vector.
         l_search: int
-            How deep to search. Should be >= k, and if it's not, we will set it to k.
+            How deep to search. Larger values result in higher query latency but better accuracy.
+            Should be >= k, and if it's not, we will set it to k.
         """
         if self.size == 0:
             return np.full((queries.shape[0], k), MAX_FLOAT32), np.full(
@@ -137,6 +142,8 @@ def create(
     uri: str,
     dimensions: int,
     vector_type: np.dtype,
+    l_build: int = L_BUILD_DEFAULT,
+    r_max_degree: int = R_MAX_DEGREE_DEFAULT,
     config: Optional[Mapping[str, Any]] = None,
     storage_version: str = STORAGE_VERSION,
     **kwargs,
@@ -152,6 +159,12 @@ def create(
     vector_type: np.dtype
         Datatype of vectors.
         Supported values (uint8, int8, float32).
+    l_build: int
+        The number of neighbors considered for each node during construction of the graph. Larger values will take more time to build but result in indices that provide higher recall for the same search complexity. l_build should be >= r_max_degree unless you need to build indices quickly and can compromise on quality.
+        Typically between 75 and 200. If not provided, use the default value of 100.
+    r_max_degree: int
+        The maximum degree for each node in the final graph. Larger values will result in larger indices and longer indexing times, but better search quality.
+        Typically between 60 and 150. If not provided, use the default value of 64.
     config: Optional[Mapping[str, Any]]
         TileDB config dictionary.
     storage_version: str
@@ -169,6 +182,8 @@ def create(
         feature_type=np.dtype(vector_type).name,
         id_type=np.dtype(np.uint64).name,
         dimensions=dimensions,
+        l_build=l_build if l_build > 0 else L_BUILD_DEFAULT,
+        r_max_degree=r_max_degree if r_max_degree > 0 else R_MAX_DEGREE_DEFAULT,
     )
     # TODO(paris): Run all of this with a single C++ call.
     empty_vector = vspy.FeatureVectorArray(
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -7,6 +7,7 @@
 from common import load_metadata
 
 from tiledb.cloud.dag import Mode
+from tiledb.vector_search import _tiledbvspy as vspy
 from tiledb.vector_search.index import Index
 from tiledb.vector_search.ingestion import TrainingSamplingPolicy
 from tiledb.vector_search.ingestion import ingest
@@ -40,7 +41,11 @@ def test_vamana_ingestion_u8(tmp_path):
     index_uri = os.path.join(tmp_path, "array")
     if os.path.exists(index_uri):
         shutil.rmtree(index_uri)
-    create_random_dataset_u8(nb=10000, d=100, nq=100, k=10, path=dataset_dir)
+
+    l_build = 101
+    r_max_degree = 65
+    dimensions = 100
+    create_random_dataset_u8(nb=10000, d=dimensions, nq=100, k=10, path=dataset_dir)
     dtype = np.dtype(np.uint8)
     k = 10
 
@@ -51,7 +56,18 @@ def test_vamana_ingestion_u8(tmp_path):
         index_type="VAMANA",
         index_uri=index_uri,
         source_uri=os.path.join(dataset_dir, "data.u8bin"),
+        l_build=l_build,
+        r_max_degree=r_max_degree,
     )
+
+    # This is not a public API, but we directly load the C++ type-erased index to test it. If you
+    # are a library user, you should not do this yourself, as the API may change.
+    ctx = vspy.Ctx({})
+    type_erased_index = vspy.IndexVamana(ctx, index_uri, None)
+    assert type_erased_index.dimensions() == dimensions
+    assert type_erased_index.l_build() == l_build
+    assert type_erased_index.r_max_degree() == r_max_degree
+
     _, result = index.query(queries, k=k)
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py
@@ -283,6 +283,18 @@ def test_construct_IndexVamana():
     assert a.id_type_string() == "int64"
     assert a.dimensions() == 0
 
+    a = vspy.IndexVamana(feature_type="float32", id_type="int64", l_build=11)
+    assert a.l_build() == 11
+
+    a = vspy.IndexVamana(feature_type="float32", id_type="int64", r_max_degree=22)
+    assert a.r_max_degree() == 22
+
+    a = vspy.IndexVamana(
+        feature_type="float32", id_type="int64", l_build=11, r_max_degree=22
+    )
+    assert a.l_build() == 11
+    assert a.r_max_degree() == 22
+
 
 def test_construct_IndexVamana_with_empty_vector(tmp_path):
     l_search = 100
@@ -291,12 +303,16 @@ def test_construct_IndexVamana_with_empty_vector(tmp_path):
     dimensions = 128
     feature_type = "float32"
     id_type = "uint64"
+    l_build = 100
+    r_max_degree = 101
 
     # First create an empty index.
     a = vspy.IndexVamana(
         feature_type=feature_type,
         id_type=id_type,
         dimensions=dimensions,
+        l_build=l_build,
+        r_max_degree=r_max_degree,
     )
     empty_vector = vspy.FeatureVectorArray(dimensions, 0, feature_type, id_type)
     a.train(empty_vector)
@@ -310,6 +326,8 @@ def test_construct_IndexVamana_with_empty_vector(tmp_path):
     assert query_set.feature_type_string() == "float32"
     groundtruth_set = vspy.FeatureVectorArray(ctx, siftsmall_groundtruth_uri)
     assert groundtruth_set.feature_type_string() == "uint64"
+    assert a.l_build() == l_build
+    assert a.r_max_degree() == r_max_degree
 
     a.train(training_set)