WIP

Nikos Papailiou · Nikos Papailiou · commit 58d2ce74dfd4 · 2023-07-10T18:55:19.000+03:00
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -1,4 +1,5 @@
 import os
+import math
 
 import numpy as np
 from tiledb.vector_search.module import *
@@ -44,7 +45,7 @@ def query(
         nprobe: int = 1,
     ):
         """
-        Open a flat index
+        Query a flat index
 
         Parameters
         ----------
@@ -117,7 +118,7 @@ def query(
         use_nuv_implementation: bool = False,
     ):
         """
-        Open a flat index
+        Query an IVF_FLAT index
 
         Parameters
         ----------
@@ -128,7 +129,7 @@ def query(
         nqueries: int
             Number of queries
         nthreads: int
-            Number of threads to use for queyr
+            Number of threads to use for query
         nprobe: int
             number of probes
         use_nuv_implementation: bool
@@ -170,3 +171,66 @@ def query(
             )
 
         return np.array(r)
+
+    def distributed_query(
+        self,
+        targets: np.ndarray,
+        k=10,
+        nthreads=8,
+        nprobe=1,
+        num_nodes=5,
+    ):
+        """
+        Distributed Query on top of an IVF_FLAT index
+
+        The active partitions returned by ``partition_ivf_index`` are split
+        into at most ``num_nodes`` contiguous slices. ``dist_qv`` is called
+        once per slice (one after another in this implementation) and the
+        per-slice results are merged into the best ``k`` entries per query.
+
+        Parameters
+        ----------
+        targets: numpy.ndarray
+            ND Array of query targets (must have dtype float32); one query
+            per column, i.e. ``targets.shape[1]`` queries
+        k: int
+            Number of top results to return per target
+        nthreads: int
+            Number of threads to use for query
+        nprobe: int
+            number of probes
+        num_nodes: int
+            Maximum number of slices the active partitions are split into
+
+        Returns
+        -------
+        list
+            One list per query holding at most ``k`` result entries, sorted
+            in ascending order of their first element.
+
+        Raises
+        ------
+        ValueError
+            If ``num_nodes`` is smaller than 1.
+        """
+        assert targets.dtype == np.float32
+        if num_nodes < 1:
+            raise ValueError("num_nodes must be a positive integer")
+
+        targets_m = array_to_matrix(targets)
+        active_partitions, active_queries = partition_ivf_index(
+            centroids=self._centroids,
+            query=targets_m,
+            nprobe=nprobe,
+            nthreads=nthreads)
+        num_parts = len(active_partitions)
+
+        # Clamp to 1 so that an empty partition list yields an empty loop
+        # instead of range() failing on a zero step.
+        parts_per_node = max(1, math.ceil(num_parts / num_nodes))
+        results = []
+        for part in range(0, num_parts, parts_per_node):
+            # The last slice may be shorter than parts_per_node.
+            part_end = min(part + parts_per_node, num_parts)
+            results.append(dist_qv(
+                dtype=self.dtype,
+                parts_uri=self.parts_db_uri,
+                ids_uri=self.ids_uri,
+                query_vectors=targets_m,
+                active_partitions=active_partitions[part:part_end],
+                active_queries=active_queries[part:part_end],
+                indices=self._index,
+                k_nn=k,
+                ctx=self.ctx,
+            ))
+
+        results_per_query = []
+        for q in range(targets.shape[1]):
+            candidates = []
+            for j in range(k):
+                for r in results:
+                    heap = r[q]
+                    # A slice may hold fewer than k entries for this query;
+                    # never index past the entries it actually contains.
+                    if j >= len(heap):
+                        continue
+                    entry = heap[j]
+                    # NOTE(review): entries whose first element is not > 0
+                    # are skipped. If that element is a distance, an exact
+                    # match (0) is dropped as well -- confirm this is intended.
+                    if entry[0] > 0:
+                        candidates.append(entry)
+            results_per_query.append(sorted(candidates, key=lambda t: t[0])[:k])
+        return results_per_query
diff --git a/apis/python/src/tiledb/vector_search/module.cc b/apis/python/src/tiledb/vector_search/module.cc
@@ -268,6 +268,16 @@ static void declare_ivf_index_tdb(py::module& m, const std::string& suffix) {
         }, py::keep_alive<1,2>());
 }
 
+// Expose fixed_min_pair_heap<T, U> to Python under the name "FixedMinPairHeap".
+//
+// Bound operations: construction from an unsigned integer, insert(),
+// len() (forwarded to size()) and integer indexing (forwarded to operator[]).
+template <class T=float, class U=size_t>
+static void declareFixedMinPairHeap(py::module& mod) {
+  using Heap = fixed_min_pair_heap<T, U>;
+  using PyFixedMinPairHeap = py::class_<Heap>;
+  // NOTE(review): buffer_protocol() is requested here, but this function does
+  // not register a def_buffer -- confirm whether it is actually needed.
+  PyFixedMinPairHeap cls(mod, "FixedMinPairHeap", py::buffer_protocol());
+
+  cls.def(py::init<unsigned>());
+  cls.def("insert", &Heap::insert);
+  cls.def("__len__", [](const Heap& v) { return v.size(); });
+  // Reject out-of-range indices with IndexError instead of reading past the
+  // stored elements; Python's index-based iteration also relies on
+  // IndexError to know where the sequence ends.
+  cls.def("__getitem__", [](Heap& v, size_t i) {
+    if (i >= v.size()) {
+      throw py::index_error();
+    }
+    return v[i];
+  });
+}
 
 // Declarations for typed subclasses of ColMajorMatrix
 template <typename P>
@@ -346,14 +356,14 @@ void declarePartitionIvfIndex(py::module& m, const std::string& suffix) {
         );
 }
 
-template <typename query_type, typename shuffled_ids_type = size_t>
+template <typename query_type, typename shuffled_ids_type = uint64_t>
 static void declare_dist_qv(py::module& m, const std::string& suffix) {
   m.def(("dist_qv_" + suffix).c_str(),
       [](tiledb::Context& ctx,
         const std::string& part_uri,
-        std::vector<shuffled_ids_type>& active_partitions,
+        std::vector<int>& active_partitions,
         ColMajorMatrix<query_type>& query,
-        std::vector<std::vector<shuffled_ids_type>>& active_queries,
+        std::vector<std::vector<int>>& active_queries,
         std::vector<shuffled_ids_type>& indices,
         const std::string& id_uri,
         size_t k_nn
@@ -508,5 +518,7 @@ PYBIND11_MODULE(_tiledbvspy, m) {
   declarePartitionedMatrix<tdbColMajorPartitionedMatrix<uint8_t, uint64_t, uint64_t, uint64_t > >(m, "tdbPartitionedMatrix", "u8");
   declarePartitionedMatrix<tdbColMajorPartitionedMatrix<float, uint64_t, uint64_t, uint64_t> >(m, "tdbPartitionedMatrix", "f32");
 
+  declare_dist_qv<uint8_t>(m, "u8");
   declare_dist_qv<float>(m, "f32");
+  declareFixedMinPairHeap(m);
 }
diff --git a/apis/python/src/tiledb/vector_search/module.py b/apis/python/src/tiledb/vector_search/module.py
@@ -302,6 +302,36 @@ def partition_ivf_index(centroids, query, nprobe=1, nthreads=0):
     else:
         raise TypeError("Unsupported type!")
 
+def dist_qv(
+    dtype: np.dtype,
+    parts_uri: str,
+    ids_uri: str,
+    query_vectors: "colMajorMatrix",
+    active_partitions: "Vector",
+    active_queries: "Vector",
+    indices: "Vector",
+    k_nn: int,
+    ctx: "Ctx" = None):
+    """
+    Dispatch to the native ``dist_qv_<suffix>`` binding matching ``dtype``.
+
+    Parameters
+    ----------
+    dtype: numpy.dtype
+        Selects the binding: ``np.float32`` -> ``dist_qv_f32``,
+        ``np.uint8`` -> ``dist_qv_u8``
+    parts_uri: str
+        Forwarded as the partitions URI argument of the binding
+    ids_uri: str
+        Forwarded as the ids URI argument of the binding
+    query_vectors: colMajorMatrix
+        Forwarded as the query matrix argument
+    active_partitions: Vector
+        Forwarded unchanged to the binding
+    active_queries: Vector
+        Forwarded unchanged to the binding
+    indices: Vector
+        Forwarded unchanged to the binding
+    k_nn: int
+        Forwarded unchanged to the binding
+    ctx: Ctx
+        Context passed as the first argument; ``Ctx({})`` is created when
+        None is given
+
+    Raises
+    ------
+    TypeError
+        If ``dtype`` is neither ``np.float32`` nor ``np.uint8``.
+    """
+    # Resolve the context first, before the dtype is inspected.
+    context = Ctx({}) if ctx is None else ctx
+
+    if dtype == np.float32:
+        native_impl = dist_qv_f32
+    elif dtype == np.uint8:
+        native_impl = dist_qv_u8
+    else:
+        raise TypeError("Unsupported type!")
+
+    # The positional order below is the one the native binding expects.
+    return native_impl(
+        context,
+        parts_uri,
+        active_partitions,
+        query_vectors,
+        active_queries,
+        indices,
+        ids_uri,
+        k_nn,
+    )
 
 def validate_top_k(results: np.ndarray, ground_truth: np.ndarray):
     if results.dtype == np.uint64:
diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
@@ -46,7 +46,6 @@ def test_flat_ingestion_f32(tmp_path):
     result = np.transpose(index.query(np.transpose(query_vectors), k=k))
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
-
 def test_ivf_flat_ingestion_u8(tmp_path):
     dataset_dir = os.path.join(tmp_path, "dataset")
     array_uri = os.path.join(tmp_path, "array")
@@ -109,6 +108,10 @@ def test_ivf_flat_ingestion_f32(tmp_path):
         partitions=partitions,
         input_vectors_per_work_item=int(size / 10),
     )
+
+    result = index.distributed_query(np.transpose(query_vectors), k=k, nprobe=partitions)
+    assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
     result = np.transpose(
         index.query(np.transpose(query_vectors), k=k, nprobe=partitions)
     )
@@ -130,6 +133,7 @@ def test_ivf_flat_ingestion_f32(tmp_path):
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
 
 
+
 def test_ivf_flat_ingestion_fvec(tmp_path):
     source_uri = "test/data/siftsmall/siftsmall_base.fvecs"
     queries_uri = "test/data/siftsmall/siftsmall_query.fvecs"