Skip to content

Commit 00e1a21

Browse files
Enable OOC processing for IVF_FLAT distributed query execution (#418)
This applies OOC computation to IVF_FLAT distributed query execution by having a loop of `load()` calls and propagating `upper_bound` to `tdbPartitionedMatrix`. This will allow us to avoid OOM errors in UDF workers by streaming the respective work partitions in configurable batches. We don't yet pass `upper_bound` from Python, as we first need to release this change in cloud UDF images; instead we set a default of 200k vectors. The default upper_bound of 200k is selected by assuming vectors with 1k dimensions, each dimension using 4 bytes. In this case, each `load()` operation would fetch at most 800MB of data.
1 parent 6530a06 commit 00e1a21

File tree

2 files changed

+79
-49
lines changed

2 files changed

+79
-49
lines changed

apis/python/test/test_ingestion.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1892,3 +1892,30 @@ def test_ivf_flat_ingestion_with_training_source_uri_numpy(tmp_path):
18921892
expected_result_d=[[0]],
18931893
expected_result_i=[[1003]],
18941894
)
1895+
1896+
1897+
def test_ivf_flat_taskgraph_query(tmp_path):
    """End-to-end check of the task-graph (distributed) IVF_FLAT query path.

    Ingests a random uint8 dataset split across multiple work items, then runs
    ``_taskgraph_query`` with ``num_partitions=10`` in LOCAL mode so the query
    is executed as several partitioned sub-tasks, and asserts the recall
    against the ground truth stays above MINIMUM_ACCURACY.
    """
    dataset_dir = os.path.join(tmp_path, "dataset")
    index_uri = os.path.join(tmp_path, "array")
    k = 10
    size = 10000
    partitions = 100
    # 129 is deliberately not a multiple of a typical SIMD width/power of two;
    # presumably chosen to exercise unaligned-dimension handling — TODO confirm.
    dimensions = 129
    nqueries = 100
    nprobe = 20
    create_random_dataset_u8(nb=size, d=dimensions, nq=nqueries, k=k, path=dataset_dir)
    dtype = np.uint8

    queries = get_queries(dataset_dir, dtype=dtype)
    gt_i, gt_d = get_groundtruth(dataset_dir, k)
    index = ingest(
        index_type="IVF_FLAT",
        index_uri=index_uri,
        source_uri=os.path.join(dataset_dir, "data.u8bin"),
        partitions=partitions,
        # Force ingestion to split the input into ~10 work items so the
        # distributed code path is actually exercised.
        input_vectors_per_work_item=int(size / 10),
    )
    _, result = index._taskgraph_query(
        queries, k=k, nprobe=nprobe, nthreads=8, mode=Mode.LOCAL, num_partitions=10
    )
    assert accuracy(result, gt_i) > MINIMUM_ACCURACY

src/include/detail/ivf/dist_qv.h

Lines changed: 52 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ auto dist_qv_finite_ram_part(
7878
const std::string& id_uri,
7979
size_t k_nn,
8080
uint64_t timestamp = 0,
81+
// The default upper_bound of 200k is selected by assuming vectors with
82+
// 1k dimensions each using 4 bytes.
83+
// In this case, each load() operation would fetch at most 800MB of data.
84+
size_t upper_bound = 200000,
8185
size_t nthreads = std::thread::hardware_concurrency(),
8286
Distance&& distance = Distance{}) {
8387
if (nthreads == 0) {
@@ -109,70 +113,68 @@ auto dist_qv_finite_ram_part(
109113
global_indices,
110114
id_uri,
111115
dist_active_partitions,
112-
0,
116+
upper_bound,
113117
temporal_policy);
114118

115-
// We are assuming that we are not doing out of core computation here.
116-
// (It is easy enough to change this if we need to.)
117-
partitioned_vectors.load();
118-
119119
scoped_timer _i{tdb_func__ + " in RAM"};
120120

121121
auto min_scores =
122122
std::vector<fixed_min_pair_heap<score_type, shuffled_ids_type>>(
123123
num_queries,
124124
fixed_min_pair_heap<score_type, shuffled_ids_type>(k_nn));
125125

126-
if (::num_partitions(partitioned_vectors) != size(dist_active_partitions)) {
127-
throw std::runtime_error(
128-
"[dist_qv_finite_ram_part] num_partitions(partitioned_vectors) != "
129-
"size(dist_active_partitions)");
130-
}
131-
132-
auto current_part_size = ::num_partitions(partitioned_vectors);
133-
size_t parts_per_thread = (current_part_size + nthreads - 1) / nthreads;
134-
135-
std::vector<std::future<decltype(min_scores)>> futs;
136-
futs.reserve(nthreads);
126+
size_t part_offset = 0;
127+
while (partitioned_vectors.load()) {
128+
_i.start();
129+
auto current_part_size = ::num_partitions(partitioned_vectors);
130+
size_t parts_per_thread = (current_part_size + nthreads - 1) / nthreads;
137131

138-
for (size_t n = 0; n < nthreads; ++n) {
139-
auto first_part = std::min<size_t>(n * parts_per_thread, current_part_size);
140-
auto last_part =
141-
std::min<size_t>((n + 1) * parts_per_thread, current_part_size);
132+
std::vector<std::future<decltype(min_scores)>> futs;
133+
futs.reserve(nthreads);
142134

143-
if (first_part != last_part) {
144-
futs.emplace_back(std::async(
145-
std::launch::async,
146-
[&query,
147-
&partitioned_vectors,
148-
&active_queries = dist_active_queries,
149-
&distance,
150-
k_nn,
151-
first_part,
152-
last_part]() {
153-
return apply_query(
154-
partitioned_vectors,
155-
std::optional<std::vector<int>>{},
156-
// std::optional{active_partitions},
157-
query,
158-
active_queries,
159-
k_nn,
160-
first_part,
161-
last_part,
162-
0,
163-
distance);
164-
}));
135+
for (size_t n = 0; n < nthreads; ++n) {
136+
auto first_part =
137+
std::min<size_t>(n * parts_per_thread, current_part_size);
138+
auto last_part =
139+
std::min<size_t>((n + 1) * parts_per_thread, current_part_size);
140+
141+
if (first_part != last_part) {
142+
futs.emplace_back(std::async(
143+
std::launch::async,
144+
[&query,
145+
&partitioned_vectors,
146+
&active_queries = dist_active_queries,
147+
&distance,
148+
k_nn,
149+
first_part,
150+
last_part,
151+
part_offset]() {
152+
return apply_query(
153+
partitioned_vectors,
154+
std::optional<std::vector<int>>{},
155+
// std::optional{dist_active_partitions},
156+
query,
157+
active_queries,
158+
k_nn,
159+
first_part,
160+
last_part,
161+
part_offset,
162+
distance);
163+
}));
164+
}
165165
}
166-
}
166+
for (size_t n = 0; n < size(futs); ++n) {
167+
auto min_n = futs[n].get();
167168

168-
for (size_t n = 0; n < size(futs); ++n) {
169-
auto min_n = futs[n].get();
170-
171-
for (size_t j = 0; j < num_queries; ++j) {
172-
for (auto&& [e, f] : min_n[j]) {
173-
min_scores[j].insert(e, f);
169+
for (size_t j = 0; j < num_queries; ++j) {
170+
for (auto&& [e, f] : min_n[j]) {
171+
min_scores[j].insert(e, f);
172+
}
174173
}
175174
}
175+
176+
part_offset += current_part_size;
177+
_i.stop();
176178
}
177179
return min_scores;
178180
}
@@ -334,6 +336,7 @@ auto dist_qv_finite_ram(
334336
id_uri,
335337
k_nn,
336338
timestamp,
339+
upper_bound,
337340
nthreads,
338341
distance);
339342
#else

0 commit comments

Comments
 (0)