Distributed query implementation

Nikos Papailiou · Nikos Papailiou · commit f4eac93d8a07 · 2023-07-11T16:03:42.000+03:00
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -3,12 +3,18 @@
 
 import numpy as np
 from tiledb.vector_search.module import *
+from tiledb.cloud.dag import Mode
 
 CENTROIDS_ARRAY_NAME = "centroids.tdb"
 INDEX_ARRAY_NAME = "index.tdb"
 IDS_ARRAY_NAME = "ids.tdb"
 PARTS_ARRAY_NAME = "parts.tdb"
 
+def submit_local(d, func, *args, **kwargs):
+    """Submit func to DAG d locally, dropping cloud-only scheduling kwargs."""
+    for cloud_only in ("image_name", "resource_class", "resources"):
+        kwargs.pop(cloud_only, None)
+    return d.submit_local(func, *args, **kwargs)
 
 class Index:
     def query(self, targets: np.ndarray, k=10, nqueries=10, nthreads=8, nprobe=1):
@@ -179,6 +185,7 @@ def distributed_query(
         nthreads=8,
         nprobe=1,
         num_nodes=5,
+        mode: Mode = Mode.REALTIME,
     ):
         """
         Distributed Query on top of an IVF_FLAT index
@@ -196,7 +203,64 @@ def distributed_query(
         nprobe: int
             number of probes
         """
-        assert targets.dtype == np.float32
+        from tiledb.cloud import dag
+        from tiledb.cloud.dag import Mode
+        from tiledb.vector_search.module import array_to_matrix, partition_ivf_index, dist_qv
+        import math
+        import numpy as np
+        from functools import partial
+
+        def dist_qv_udf(
+            dtype: np.dtype,
+            parts_uri: str,
+            ids_uri: str,
+            query_vectors: np.ndarray,
+            active_partitions: np.array,
+            active_queries: np.array,
+            indices: np.array,
+            k_nn: int):
+            """
+            Run dist_qv for one slice of partitions and return nested lists.
+
+            The query vectors arrive as a numpy array and are converted with
+            array_to_matrix here, inside the task, before calling dist_qv.
+            """
+            raw = dist_qv(
+                dtype=dtype,
+                parts_uri=parts_uri,
+                ids_uri=ids_uri,
+                query_vectors=array_to_matrix(query_vectors),
+                active_partitions=active_partitions,
+                active_queries=active_queries,
+                indices=indices,
+                k_nn=k_nn,
+            )
+            # Copy element by element into plain nested Python lists.
+            return [[raw[q][j] for j in range(len(raw[q]))] for q in range(len(raw))]
+
+        assert targets.dtype == self.dtype
+        # Build exactly one DAG for the requested mode. The branches must be
+        # mutually exclusive: with two independent `if` statements, a BATCH
+        # DAG was always replaced by the single-worker fallback in the `else`
+        # branch, so batch queries never ran in batch mode.
+        if mode == Mode.BATCH or mode == Mode.REALTIME:
+            d = dag.DAG(
+                name="vector-query",
+                mode=mode,
+                max_workers=num_nodes,
+            )
+            submit = d.submit
+        else:
+            # Any other mode runs locally: one worker, "default" namespace,
+            # and submit_local drops kwargs such as image_name before it
+            # delegates to d.submit_local.
+            d = dag.DAG(
+                name="vector-query",
+                mode=Mode.REALTIME,
+                max_workers=1,
+                namespace="default",
+            )
+            submit = partial(submit_local, d)
 
         targets_m = array_to_matrix(targets)
         active_partitions, active_queries = partition_ivf_index(
@@ -207,29 +271,38 @@ def distributed_query(
         num_parts = len(active_partitions)
 
         parts_per_node = int(math.ceil(num_parts / num_nodes))
-        results = []
+        nodes = []
         for part in range(0, num_parts, parts_per_node):
             part_end = part + parts_per_node
             if part_end > num_parts:
                 part_end = num_parts
-            results.append(dist_qv(
+            nodes.append(submit(
+                dist_qv_udf,
                 dtype=self.dtype,
                 parts_uri=self.parts_db_uri,
                 ids_uri=self.ids_uri,
-                query_vectors=targets_m,
+                query_vectors=targets,
                 active_partitions=np.array(active_partitions)[part:part_end],
-                active_queries=np.array(active_queries[part:part_end]),
-                indices=self._index,
+                active_queries=np.array(active_queries[part:part_end], dtype=object),
+                indices=np.array(self._index),
                 k_nn=k,
-                ctx=self.ctx,
+                resource_class='large',
+                image_name="3.9-vectorsearch",
             ))
 
+        d.compute()
+        d.wait()
+        results = []
+        for node in nodes:
+            res = node.result()
+            results.append(res)
+
         results_per_query = []
         for q in range(targets.shape[1]):
             tmp_results = []
             for j in range(k):
                 for r in results:
-                    if len(r[q]) > 0:
+                    if len(r[q]) > j:
                         if r[q][j][0] > 0:
                             tmp_results.append(r[q][j])
             results_per_query.append(sorted(tmp_results, key=lambda t: t[0])[0:k])
diff --git a/apis/python/src/tiledb/vector_search/module.cc b/apis/python/src/tiledb/vector_search/module.cc
@@ -348,8 +348,8 @@ void declareStdVector(py::module& m, const std::string& suffix) {
 template <typename T, typename indices_type = size_t>
 void declarePartitionIvfIndex(py::module& m, const std::string& suffix) {
   m.def(("partition_ivf_index_" + suffix).c_str(),
-        [](ColMajorMatrix<T>& centroids,
-           ColMajorMatrix<float>& query,
+        [](ColMajorMatrix<float>& centroids,
+           ColMajorMatrix<T>& query,
            size_t nprobe,
            size_t nthreads) {
           return detail::ivf::partition_ivf_index(centroids, query, nprobe, nthreads);
diff --git a/apis/python/src/tiledb/vector_search/module.py b/apis/python/src/tiledb/vector_search/module.py
@@ -295,9 +295,9 @@ def ivf_query(
 
 
 def partition_ivf_index(centroids, query, nprobe=1, nthreads=0):
-    if centroids.dtype == np.float32:
+    if query.dtype == np.float32:
         return partition_ivf_index_f32(centroids, query, nprobe, nthreads)
-    elif centroids.dtype == np.uint8:
+    elif query.dtype == np.uint8:
         return partition_ivf_index_u8(centroids, query, nprobe, nthreads)
     else:
         raise TypeError("Unsupported type!")
@@ -307,9 +307,9 @@ def dist_qv(
     parts_uri: str,
     ids_uri: str,
     query_vectors: "colMajorMatrix",
-    active_partitions: "Vector",
-    active_queries: "Vector",
-    indices: "Vector",
+    active_partitions: np.array,
+    active_queries: np.array,
+    indices: np.array,
     k_nn: int,
     ctx: "Ctx" = None):
     if ctx is None:
@@ -321,7 +321,7 @@ def dist_qv(
             active_partitions,
             query_vectors,
             active_queries,
-            indices,
+            StdVector_u64(indices),
             ids_uri,
             k_nn
         ]
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -2,6 +2,7 @@
 
 from tiledb.vector_search.ingestion import ingest
 from tiledb.vector_search.index import IVFFlatIndex
+from tiledb.cloud.dag import Mode
 
 MINIMUM_ACCURACY = 0.9
 
@@ -52,7 +53,10 @@ def test_ivf_flat_ingestion_u8(tmp_path):
     k = 10
     size = 100000
     partitions = 100
-    create_random_dataset_u8(nb=size, d=100, nq=10, k=k, path=dataset_dir)
+    dimensions = 128
+    nqueries = 100
+    nprobe = 20
+    create_random_dataset_u8(nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir)
     source_type = "U8BIN"
     dtype = np.uint8
 
@@ -67,33 +71,40 @@ def test_ivf_flat_ingestion_u8(tmp_path):
         input_vectors_per_work_item=int(size / 10),
     )
     result = np.transpose(
-        index.query(np.transpose(query_vectors), k=k, nprobe=10)
+        index.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
     index_ram = IVFFlatIndex(uri=array_uri, dtype=dtype, memory_budget=int(size / 10))
     result = np.transpose(
-        index_ram.query(np.transpose(query_vectors), k=k, nprobe=partitions)
+        index_ram.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
     result = np.transpose(
         index_ram.query(
             np.transpose(query_vectors),
             k=k,
-            nprobe=partitions,
+            nprobe=nprobe,
             use_nuv_implementation=True,
         )
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
+    result = index_ram.distributed_query(np.transpose(query_vectors.astype(np.uint8)), k=k, nprobe=nprobe, mode=Mode.LOCAL)
+    assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
 def test_ivf_flat_ingestion_f32(tmp_path):
+    import time
     dataset_dir = os.path.join(tmp_path, "dataset")
     array_uri = os.path.join(tmp_path, "array")
     k = 10
     size = 100000
+    dimensions = 128
     partitions = 100
-    create_random_dataset_f32(nb=size, d=100, nq=10, k=k, path=dataset_dir)
+    nqueries = 100
+    nprobe = 20
+
+    create_random_dataset_f32(nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir)
     source_type = "F32BIN"
     dtype = np.float32
 
@@ -109,29 +120,29 @@ def test_ivf_flat_ingestion_f32(tmp_path):
         input_vectors_per_work_item=int(size / 10),
     )
 
-    result = index.distributed_query(np.transpose(query_vectors), k=k, nprobe=partitions)
-    assert accuracy(result, gt_i) > MINIMUM_ACCURACY
-
     result = np.transpose(
-        index.query(np.transpose(query_vectors), k=k, nprobe=partitions)
+        index.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
     index_ram = IVFFlatIndex(uri=array_uri, dtype=dtype, memory_budget=int(size / 10))
     result = np.transpose(
-        index_ram.query(np.transpose(query_vectors), k=k, nprobe=partitions)
+        index_ram.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
     result = np.transpose(
         index_ram.query(
             np.transpose(query_vectors),
             k=k,
-            nprobe=partitions,
+            nprobe=nprobe,
             use_nuv_implementation=True,
         )
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
+    result = index_ram.distributed_query(np.transpose(query_vectors), k=k, nprobe=nprobe, mode=Mode.LOCAL)
+    assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
 
 
 def test_ivf_flat_ingestion_fvec(tmp_path):
@@ -143,8 +154,9 @@ def test_ivf_flat_ingestion_fvec(tmp_path):
     array_uri = os.path.join(tmp_path, "array")
     k = 100
     dimensions = 128
-    partitions = 1000
+    partitions = 100
     nqueries = 100
+    nprobe = 20
 
     query_vectors = get_queries_fvec(
         queries_uri, dimensions=dimensions, nqueries=nqueries
@@ -159,21 +171,24 @@ def test_ivf_flat_ingestion_fvec(tmp_path):
         partitions=partitions,
     )
     result = np.transpose(
-        index.query(np.transpose(query_vectors), k=k, nprobe=partitions)
+        index.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
     index_ram = IVFFlatIndex(uri=array_uri, dtype=dtype)
     result = np.transpose(
-        index_ram.query(np.transpose(query_vectors), k=k, nprobe=partitions)
+        index_ram.query(np.transpose(query_vectors), k=k, nprobe=nprobe)
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
     result = np.transpose(
         index_ram.query(
             np.transpose(query_vectors),
             k=k,
-            nprobe=partitions,
+            nprobe=nprobe,
             use_nuv_implementation=True,
         )
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+    result = index_ram.distributed_query(np.transpose(query_vectors), k=k, nprobe=nprobe, mode=Mode.LOCAL)
+    assert accuracy(result, gt_i) > MINIMUM_ACCURACY