TileDB-Inc
diff --git a/‎src/CMakeLists.txt‎
Lines changed: 4 additions & 2 deletions b/‎src/CMakeLists.txt‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎src/benchmarks/ivf_flat_full.bash‎
Lines changed: 2 additions & 2 deletions b/‎src/benchmarks/ivf_flat_full.bash‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/benchmarks/setup.bash‎
Lines changed: 4 additions & 0 deletions b/‎src/benchmarks/setup.bash‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/include/concepts.h‎
Lines changed: 19 additions & 1 deletion b/‎src/include/concepts.h‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎src/include/detail/flat/gemm.h‎
Lines changed: 10 additions & 0 deletions b/‎src/include/detail/flat/gemm.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/include/detail/flat/qv.h‎
Lines changed: 14 additions & 10 deletions b/‎src/include/detail/flat/qv.h‎
Lines changed: 14 additions & 10 deletions
diff --git a/‎src/include/detail/flat/vq.h‎
Lines changed: 6 additions & 5 deletions b/‎src/include/detail/flat/vq.h‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/include/detail/ivf/dist_qv.h‎
Lines changed: 59 additions & 3 deletions b/‎src/include/detail/ivf/dist_qv.h‎
Lines changed: 59 additions & 3 deletions
diff --git a/‎src/include/detail/ivf/partition.h‎
Lines changed: 1 addition & 4 deletions b/‎src/include/detail/ivf/partition.h‎
Lines changed: 1 addition & 4 deletions
@@ -101,12 +101,14 @@ endif()
 if (CMAKE_OSX_ARCHITECTURES STREQUAL arm64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
     set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-elide-constructors ${FCONCEPTS_DIAGNOSTICS_DEPTH} " CACHE STRING "" FORCE)
     set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -DNDEBUG " CACHE STRING "" FORCE)
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -DNDEBUG" CACHE STRING "" FORCE)
+#    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -DNDEBUG" CACHE STRING "" FORCE)
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -UNDEBUG" CACHE STRING "" FORCE)  # -UNDEBUG undefines NDEBUG, so assert() stays active in RelWithDebInfo builds (ARM branch: no -march=native)
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG " CACHE STRING "" FORCE)
 else()
     set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-elide-constructors ${FCONCEPTS_DIAGNOSTICS_DEPTH} " CACHE STRING "" FORCE)
     set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -DNDEBUG " CACHE STRING "" FORCE)
-    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -march=native -DNDEBUG" CACHE STRING "" FORCE)
+#    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -march=native -DNDEBUG" CACHE STRING "" FORCE)
+    set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -march=native -UNDEBUG" CACHE STRING "" FORCE)  # Same as the ARM branch plus -march=native: optimized build with debug info and assert() enabled
     set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -march=native -DNDEBUG " CACHE STRING "" FORCE)
 endif()
 
 
@@ -33,7 +33,8 @@ uptime
 
 printf "\n\n-----------------------------------------------------------------------------------------------------------------------------------------\n\n"
 
-if ping -c 1 -W 1250 169.254.169.254;
+#if ping -c 1 -W 1250 169.254.169.254;
+if [[ -e "/sys/hypervisor/uuid" ]]  # uuid is a sysfs file, not a directory, so test existence (-e); the previous -d test could never succeed. NOTE(review): Nitro-based instances may not expose this file -- confirm whether a DMI-based check is also needed.
 then
   echo "Running on EC2 instance"
   curl -s http://169.254.169.254/latest/meta-data/instance-type
@@ -65,7 +66,6 @@ do
     init_1B_${source}
     for blocksize in 0 1000000 10000000 ;
     do
-	log_header
 	for nqueries in 1 10 100 1000 10000;
 	do
 	    for nprobe in 1 2 4 8 16 32 64 128 ;
 
@@ -433,6 +433,10 @@ function ivf_query() {
 		local _nthreads="--nthreads ${2}"
 		shift 2
 		;;
+	    --ppt)  # Accepts "--ppt <value>"; presumably forwarded to the query command later in ivf_query -- confirm
+		local _ppt="--ppt ${2}"  # Keep the flag and its value together as one pre-formatted string
+		shift 2  # Consume both the flag and its value
+		;;
 	    --cluster|--nprobe)
 		local _cluster="--nprobe ${2}"
 		shift 2
 
@@ -39,6 +39,24 @@
 #include <span>
 #include <type_traits>
 
+template <typename T>  // Concept: satisfied when `t.load()` is a well-formed expression for an object of type T.
+concept has_load_member = requires(T&& t) {  // T&& preserves T's const/reference qualifiers, so has_load_member<const X&> needs a const-callable load()
+  t.load();  // Only validity of the call is checked; the result type is unconstrained
+};
+
+template <class T>  // Boolean variable-template form of has_load_member<T>, usable in `if constexpr` conditions.
+constexpr bool is_loadable_v = has_load_member<T>;
+
+template <typename T>  // Concept: satisfied when `t.col_offset()` is a well-formed expression for an object of type T.
+concept has_col_offset = requires(T&& t) {  // T&& preserves T's const/reference qualifiers when the call is checked
+  t.col_offset();  // Only validity of the call is checked; the result type is unconstrained
+};
+
+template <typename T>  // Concept: satisfied when `t.num_col_parts()` is a well-formed expression for an object of type T.
+concept has_num_col_parts = requires(T&& t) {  // T&& preserves T's const/reference qualifiers when the call is checked
+  t.num_col_parts();  // Only validity of the call is checked; the result type is unconstrained
+};
+
 template <typename T>
 concept feature_vector = requires(T t) {
   typename T::value_type;
@@ -78,4 +96,4 @@ concept vector_database = requires(T t) {
 template <typename T>
 concept query_set = vector_database<T>;
 
-#endif
+#endif
@@ -43,6 +43,9 @@ namespace detail::flat {
 
 template <class DB, class Q>
 auto gemm_query(const DB& db, const Q& q, int k, bool nth, size_t nthreads) {
+  if constexpr (is_loadable_v<decltype(db)>) {
+    db.load();
+  }
   scoped_timer _{"Total time " + tdb_func__};
   auto scores = gemm_scores(db, q, nthreads);
   auto top_k = get_top_k(scores, k, nth, nthreads);
@@ -62,7 +65,11 @@ auto blocked_gemm_query(DB& db, Q& q, int k, bool nth, size_t nthreads) {
   std::vector<fixed_min_heap<element>> min_scores(
       size(q), fixed_min_heap<element>(k));
 
+  log_timer _i{tdb_func__ + " in RAM"};
+
   while (db.load()) {
+    _i.start();
+
     gemm_scores(db, q, scores, nthreads);
 
     auto par = stdx::execution::indexed_parallel_policy{nthreads};
@@ -72,8 +79,10 @@ auto blocked_gemm_query(DB& db, Q& q, int k, bool nth, size_t nthreads) {
             min_scores[i].insert({scores(j, i), j + db.col_offset()});
           }
         });
+    _i.stop();
   }
 
+  _i.start();
   ColMajorMatrix<size_t> top_k(k, q.num_cols());
   for (size_t j = 0; j < size(min_scores); ++j) {
     // @todo get_top_k_from_heap
@@ -84,6 +93,7 @@ auto blocked_gemm_query(DB& db, Q& q, int k, bool nth, size_t nthreads) {
         top_k[j].begin(),
         ([](auto&& e) { return e.second; }));
   }
+  _i.stop();
 
   return top_k;
 }
 
@@ -61,11 +61,13 @@ namespace detail::flat {
  */
 
 template <class DB, class Q>
-auto qv_query_nth(
-    const DB& db, const Q& q, int k, bool nth, unsigned int nthreads) {
-  scoped_timer _{tdb_func__};
+auto qv_query_nth(DB& db, const Q& q, int k, bool nth, unsigned int nthreads) {
+  if constexpr (is_loadable_v<decltype(db)>) {
+    db.load();
+  }
+  scoped_timer _{tdb_func__ + (nth ? std::string{"nth"} : std::string{"heap"})};
 
-  ColMajorMatrix<size_t> top_k(k, q.num_cols());
+  ColMajorMatrix<size_t> top_k(k, size(q));
 
   auto par = stdx::execution::indexed_parallel_policy{nthreads};
   stdx::range_for_each(
@@ -96,10 +98,12 @@ auto qv_query_nth(
  *
  */
 template <vector_database DB, class Q>
-auto qv_query_heap(const DB& db, const Q& q, size_t k, unsigned nthreads) {
-  scoped_timer _{tdb_func__};
+auto qv_query_heap(DB& db, const Q& q, size_t k, unsigned nthreads) {
+  if constexpr (is_loadable_v<decltype(db)>) {
+    db.load();
+  }
 
-  using element = std::pair<float, int>;
+  scoped_timer _{tdb_func__};
 
   ColMajorMatrix<size_t> top_k(k, q.num_cols());
 
@@ -124,12 +128,12 @@ auto qv_query_heap(const DB& db, const Q& q, size_t k, unsigned nthreads) {
       futs.emplace_back(std::async(
           std::launch::async, [k, start, stop, size_db, &q, &db, &top_k]() {
             for (size_t j = start; j < stop; ++j) {
-              fixed_min_heap<element> min_scores(k);
+              fixed_min_pair_heap<float, size_t> min_scores(k);
               size_t idx = 0;
 
               for (size_t i = 0; i < size_db; ++i) {
                 auto score = L2(q[j], db[i]);
-                min_scores.insert(element{score, i});
+                min_scores.insert(score, i);
               }
 
               // @todo use get_top_k_from_heap
@@ -138,7 +142,7 @@ auto qv_query_heap(const DB& db, const Q& q, size_t k, unsigned nthreads) {
                   min_scores.begin(),
                   min_scores.end(),
                   top_k[j].begin(),
-                  ([](auto&& e) { return e.second; }));
+                  ([](auto&& e) { return std::get<1>(e); }));
             }
           }));
     }
 
@@ -52,10 +52,11 @@ namespace detail::flat {
  * scores matrix (and which could also be used for out-of core).
  */
 template <class DB, class Q>
-auto vq_query_nth(const DB& db, const Q& q, int k, bool nth, int nthreads) {
-  scoped_timer _{"Total time " + tdb_func__};
-
-  // scoped_timer _{tdb_func__ + ", nth = " + std::to_string(nth)};
+auto vq_query_nth(DB& db, const Q& q, int k, bool nth, int nthreads) {
+  if constexpr (is_loadable_v<decltype(db)>) {
+    db.load();
+  }
+  scoped_timer _{tdb_func__ + (nth ? std::string{"nth"} : std::string{"heap"})};
 
   ColMajorMatrix<float> scores(db.num_cols(), q.num_cols());
 
@@ -123,7 +124,7 @@ auto vq_query_heap(DB& db, Q& q, int k, unsigned nthreads) {
         [&, size_q](auto&& db_vec, auto&& n = 0, auto&& i = 0) {
           for (size_t j = 0; j < size_q; ++j) {
             auto score = L2(q[j], db_vec);
-            scores[n][j].insert(element{score, i + db.offset()});
+            scores[n][j].insert(element{score, i + db.col_offset()});
           }
         });
     _i.stop();
 
@@ -47,6 +47,8 @@
 #include "stats.h"
 #include "utils/fixed_min_queues.h"
 
+#include "detail/ivf/qv.h"
+
 namespace detail::ivf {
 
 /**
@@ -90,7 +92,9 @@ auto dist_qv_finite_ram_part(
       shuffled_ids_type,
       indices_type,
       parts_type>(ctx, part_uri, indices, active_partitions, id_uri, 0);
-  // !! Make sure to load the data into the matrix
+
+  // We are assuming that we are not doing out of core computation here.
+  // (It is easy enough to change this if we need to.)
   shuffled_db.load();
 
   scoped_timer _i{tdb_func__ + " in RAM"};
@@ -105,9 +109,61 @@ auto dist_qv_finite_ram_part(
     new_indices[i + 1] = new_indices[i] + indices[active_partitions[i] + 1] -
                          indices[active_partitions[i]];
   }
-
   assert(shuffled_db.num_cols() == size(shuffled_db.ids()));
 
+  auto min_scores = std::vector<fixed_min_pair_heap<float, size_t>>(
+      num_queries, fixed_min_pair_heap<float, size_t>(k_nn));
+
+  auto current_part_size = shuffled_db.num_col_parts();
+
+  size_t parts_per_thread = (current_part_size + nthreads - 1) / nthreads;
+
+  std::vector<std::future<decltype(min_scores)>> futs;
+  futs.reserve(nthreads);
+
+  for (size_t n = 0; n < nthreads; ++n) {
+    auto first_part = std::min<size_t>(n * parts_per_thread, current_part_size);
+    auto last_part =
+        std::min<size_t>((n + 1) * parts_per_thread, current_part_size);
+
+    if (first_part != last_part) {
+      futs.emplace_back(std::async(
+          std::launch::async,
+          [&query,
+           &shuffled_db,
+           &new_indices,
+           &active_queries = active_queries,
+           &active_partitions = active_partitions,
+           k_nn,
+           first_part,
+           last_part]() {
+            return apply_query(
+                query,
+                shuffled_db,
+                new_indices,
+                active_queries,
+                shuffled_db.ids(),
+                active_partitions,
+                k_nn,
+                first_part,
+                last_part);
+          }));
+    }
+  }
+
+  for (size_t n = 0; n < size(futs); ++n) {
+    auto min_n = futs[n].get();
+
+    for (size_t j = 0; j < num_queries; ++j) {
+      for (auto&& e : min_n[j]) {
+        min_scores[j].insert(std::get<0>(e), std::get<1>(e));
+      }
+    }
+  }
+  return min_scores;
+}
+
+#if 0
   auto min_scores =
       std::vector<std::vector<fixed_min_pair_heap<float, size_t>>>(
           nthreads,
@@ -177,7 +233,7 @@ auto dist_qv_finite_ram_part(
   }
 
   return min_min_scores;
-}
+#endif
 
 template <typename T, class shuffled_ids_type>
 auto dist_qv_finite_ram(
 
@@ -53,10 +53,7 @@ namespace detail::ivf {
  *
  */
 auto partition_ivf_index(
-    auto&& centroids,
-    auto&& query,
-    size_t nprobe,
-    size_t nthreads) {
+    auto&& centroids, auto&& query, size_t nprobe, size_t nthreads) {
   scoped_timer _{tdb_func__};
 
   size_t dimension = centroids.num_rows();