Fix empty results (pad top-k output with sentinel max values when fewer than k_nn neighbors are found)

Nikos Papailiou · Nikos Papailiou · commit e089a2f73e8e · 2023-09-19T18:38:34.000+03:00
diff --git a/apis/python/src/tiledb/vector_search/module.cc b/apis/python/src/tiledb/vector_search/module.cc
@@ -126,7 +126,7 @@ static void declare_qv_query_heap_infinite_ram(py::module& m, const std::string&
          size_t k_nn,
          size_t nthreads) -> py::tuple { //std::pair<ColMajorMatrix<float>, ColMajorMatrix<size_t>> { // TODO change return type
 
-        auto r = detail::ivf::qv_query_heap_infinite_ram(
+        auto r = detail::ivf::qv_query_heap_infinite_ram<Id_Type>(
             parts,
             centroids,
             query_vectors,
diff --git a/src/include/detail/flat/vq.h b/src/include/detail/flat/vq.h
@@ -138,7 +138,7 @@ auto vq_query_heap(
   }
 
   consolidate_scores(scores);
-  auto top_k = get_top_k_with_scores(scores, k_nn);
+  auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);
 
   return top_k;
 }
@@ -223,7 +223,7 @@ auto vq_query_heap_tiled(
   } while (load(db));
 
   consolidate_scores(scores);
-  auto top_k = get_top_k_with_scores(scores, k_nn);
+  auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);
 
   return top_k;
 }
@@ -300,7 +300,7 @@ auto vq_query_heap_2(
   } while (load(db));
 
   consolidate_scores(scores);
-  auto top_k = get_top_k_with_scores(scores, k_nn);
+  auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);
 
   return top_k;
 }
diff --git a/src/include/detail/ivf/qv.h b/src/include/detail/ivf/qv.h
@@ -78,6 +78,7 @@ namespace detail::ivf {
  * Overload for already opened arrays.  Since the array is already opened, we
  * don't need to specify its type with a template parameter.
  */
+template <class ids_type = size_t>
 auto qv_query_heap_infinite_ram(
     auto&& partitioned_db,
     auto&& centroids,
@@ -113,7 +114,7 @@ auto qv_query_heap_infinite_ram(
   auto partitioned_db = tdbColMajorMatrix<T>(ctx, part_uri);
   auto partitioned_ids = read_vector<partitioned_ids_type>(ctx, id_uri);
 
-  return qv_query_heap_infinite_ram(
+  return qv_query_heap_infinite_ram<partitioned_ids_type>(
       partitioned_db,
       centroids,
       q,
@@ -148,6 +149,7 @@ auto qv_query_heap_infinite_ram(
  * @param nthreads How many threads to use for parallel execution
  * @return The indices of the top_k neighbors for each query vector
  */
+template <class ids_type = size_t>
 auto qv_query_heap_infinite_ram(
     const std::string& part_uri,
     auto&& centroids,
@@ -158,7 +160,7 @@ auto qv_query_heap_infinite_ram(
     size_t k_nn,
     size_t nthreads) {
   tiledb::Context ctx;
-  return qv_query_heap_infinite_ram(
+  return qv_query_heap_infinite_ram<ids_type>(
       ctx, part_uri, centroids, q, indices, id_uri, nprobe, k_nn, nthreads);
 }
 
@@ -188,6 +190,7 @@ auto qv_query_heap_infinite_ram(
  * @return The indices of the top_k neighbors for each query vector
  */
 // @todo We should still order the queries so partitions are searched in order
+template <class ids_type>
 auto qv_query_heap_infinite_ram(
     auto&& partitioned_db,
     auto&& centroids,
@@ -221,8 +224,8 @@ auto qv_query_heap_infinite_ram(
   auto top_centroids =
       detail::flat::qv_query_heap_0(centroids, q, nprobe, nthreads);
 
-  auto min_scores = std::vector<fixed_min_pair_heap<float, size_t>>(
-      size(q), fixed_min_pair_heap<float, size_t>(k_nn));
+  auto min_scores = std::vector<fixed_min_pair_heap<float, ids_type>>(
+      size(q), fixed_min_pair_heap<float, ids_type>(k_nn));
 
   // Parallelizing over q is not going to be very efficient
   {
@@ -244,7 +247,7 @@ auto qv_query_heap_infinite_ram(
         });
   }
 
-  auto top_k = get_top_k_with_scores(min_scores, k_nn);
+  auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, ids_type>, ids_type>(min_scores, k_nn);
   return top_k;
 }
 
diff --git a/src/include/scoring.h b/src/include/scoring.h
@@ -47,6 +47,7 @@
 #include <cmath>
 #include <future>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <numeric>
 #include <queue>
@@ -59,9 +60,6 @@
 #include "utils/fixed_min_heap.h"
 #include "utils/timer.h"
 
-
-
-
 // ----------------------------------------------------------------------------
 // Helper utilities
 //----------------------------------------------------------------------------
@@ -292,7 +290,7 @@ inline auto get_top_k(std::vector<std::vector<Heap>>& scores, size_t k_nn) {
 // ----------------------------------------------------------------------------
 // Functions for computing top k neighbors with scores
 // ----------------------------------------------------------------------------
-
+template <class Index = size_t, class score_type = float>
 inline void get_top_k_with_scores_from_heap(
     auto&& min_scores, auto&& top_k, auto&& top_k_scores) {
   std::sort_heap(begin(min_scores), end(min_scores), [](auto&& a, auto&& b) {
@@ -306,6 +304,10 @@ inline void get_top_k_with_scores_from_heap(
       begin(min_scores), end(min_scores), begin(top_k), ([](auto&& e) {
         return std::get<1>(e);
       }));
+  for (size_t i = min_scores.size(); i < top_k.size(); ++i) {
+    top_k[i] = std::numeric_limits<Index>::max();
+    top_k_scores[i] = std::numeric_limits<score_type>::max();
+  }
 }
 
 // Overload for one-d scores
@@ -320,7 +322,7 @@ inline auto get_top_k_with_scores(std::vector<Heap>& scores, size_t k_nn) {
   ColMajorMatrix<score_type> top_scores(k_nn, num_queries);
 
   for (size_t j = 0; j < num_queries; ++j) {
-    get_top_k_with_scores_from_heap(scores[j], top_k[j], top_scores[j]);
+    get_top_k_with_scores_from_heap<Index, score_type>(scores[j], top_k[j], top_scores[j]);
   }
   return std::make_tuple(std::move(top_scores), std::move(top_k));
 }
diff --git a/src/include/utils/fixed_min_heap.h b/src/include/utils/fixed_min_heap.h
@@ -57,11 +57,8 @@ class fixed_min_set_heap_1 : public std::vector<T> {
 
   void insert(T const& x) {
     if (Base::size() < max_size) {
-      Base::push_back(x);
-      // std::push_heap(begin(*this), end(*this), std::less<T>());
-      if (Base::size() == max_size) {
-        std::make_heap(begin(*this), end(*this), std::less<T>());
-      }
+      this->push_back(x);
+      std::push_heap(begin(*this), end(*this), std::less<T>());
     } else if (x < this->front()) {
       std::pop_heap(begin(*this), end(*this), std::less<T>());
       this->pop_back();
@@ -91,12 +88,8 @@ class fixed_min_set_heap_2 : public std::vector<T> {
 
   void insert(T const& x) {
     if (Base::size() < max_size) {
-      Base::push_back(x);
-      // std::push_heap(begin(*this), end(*this), std::less<T>());
-      if (Base::size() == max_size) {
-        // std::make_heap(begin(*this), end(*this), std::less<T>());
-        std::make_heap(begin(*this), end(*this));
-      }
+      this->push_back(x);
+      std::push_heap(begin(*this), end(*this));
     } else if (x < this->front()) {
       // std::pop_heap(begin(*this), end(*this), std::less<T>());
       std::pop_heap(begin(*this), end(*this));
@@ -138,13 +131,10 @@ class fixed_min_pair_heap : public std::vector<std::tuple<T, U>> {
 
   void insert(const T& x, const U& y) {
     if (Base::size() < max_size) {
-      Base::emplace_back(x, y);
-      // std::push_heap(begin(*this), end(*this), std::less<T>());
-      if (Base::size() == max_size) {
-        std::make_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
-          return std::get<0>(a) < std::get<0>(b);
-        });
-      }
+      this->emplace_back(x, y);
+      std::push_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
+        return std::get<0>(a) < std::get<0>(b);
+      });
     } else if (x < std::get<0>(this->front())) {
       std::pop_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
         return std::get<0>(a) < std::get<0>(b);

Original file line number	Diff line number	Diff line change
`@@ -138,7 +138,7 @@ auto vq_query_heap(`
`138`	`138`	`}`
`139`	`139`
`140`	`140`	`consolidate_scores(scores);`
`141`		`- auto top_k = get_top_k_with_scores(scores, k_nn);`
	`141`	`+ auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);`
`142`	`142`
`143`	`143`	`return top_k;`
`144`	`144`	`}`
`@@ -223,7 +223,7 @@ auto vq_query_heap_tiled(`
`223`	`223`	`} while (load(db));`
`224`	`224`
`225`	`225`	`consolidate_scores(scores);`
`226`		`- auto top_k = get_top_k_with_scores(scores, k_nn);`
	`226`	`+ auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);`
`227`	`227`
`228`	`228`	`return top_k;`
`229`	`229`	`}`
`@@ -300,7 +300,7 @@ auto vq_query_heap_2(`
`300`	`300`	`} while (load(db));`
`301`	`301`
`302`	`302`	`consolidate_scores(scores);`
`303`		`- auto top_k = get_top_k_with_scores(scores, k_nn);`
	`303`	`+ auto top_k = get_top_k_with_scores<fixed_min_pair_heap<float, Index>, Index>(scores, k_nn);`
`304`	`304`
`305`	`305`	`return top_k;`
`306`	`306`	`}`