feat: add function for scheduling chunked computations (PROOF-928) (#255)

rnburn · web-flow · commit 1f579c0385db · 2025-04-01T11:26:04.000-07:00
* add function to schedule asynchronous chunked computations

* remove print statements

* doc

* change signature

* drop dead code
diff --git a/sxt/execution/device/BUILD b/sxt/execution/device/BUILD
@@ -3,6 +3,14 @@ load(
     "sxt_cc_component",
 )
 
+sxt_cc_component(
+    name = "chunk_context",
+    with_test = False,  # plain data struct with no logic of its own to unit test
+    deps = [
+        "//sxt/execution/async:shared_future",
+    ],
+)
+
 sxt_cc_component(
     name = "device_viewable",
     test_deps = [
@@ -78,19 +86,24 @@ sxt_cc_component(
     name = "for_each",
     impl_deps = [
         ":available_device",
-        "//sxt/execution/async:future",
         "//sxt/execution/async:coroutine",
         "//sxt/base/device:active_device_guard",
         "//sxt/base/device:property",
+        "//sxt/base/device:state",
         "//sxt/base/iterator:split",
     ],
     test_deps = [
+        "//sxt/base/error:assert",
         "//sxt/base/iterator:index_range",
+        "//sxt/base/iterator:index_range_iterator",
         "//sxt/base/test:unit_test",
         "//sxt/execution/async:future",
     ],
     deps = [
-        "//sxt/execution/async:future_fwd",
+        ":chunk_context",
+        "//sxt/base/device:stream",
+        "//sxt/execution/async:future",
+        "//sxt/execution/async:shared_future",
         "//sxt/execution/schedule:scheduler",
     ],
 )
diff --git a/sxt/execution/device/chunk_context.cc b/sxt/execution/device/chunk_context.cc
@@ -0,0 +1,17 @@
+/** Proofs GPU - Space and Time's cryptographic proof algorithms on the CPU and GPU.
+ *
+ * Copyright 2025-present Space and Time Labs, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "sxt/execution/device/chunk_context.h"
diff --git a/sxt/execution/device/chunk_context.h b/sxt/execution/device/chunk_context.h
@@ -0,0 +1,56 @@
+/** Proofs GPU - Space and Time's cryptographic proof algorithms on the CPU and GPU.
+ *
+ * Copyright 2025-present Space and Time Labs, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "sxt/execution/async/shared_future.h"
+
+namespace sxt::xendv {
+//--------------------------------------------------------------------------------------------------
+// chunk_context
+//--------------------------------------------------------------------------------------------------
+/**
+ * Context passed to the functor that processes an individual chunk of a chunked computation.
+ */
+struct chunk_context {
+  // zero-based position of the chunk in launch order, counted across all devices
+  unsigned chunk_index = 0;
+
+  // the device used to process the chunk
+  unsigned device_index = 0;
+
+  // the total number of devices used to process the collection of chunks
+  unsigned num_devices_used = 0;
+
+  // When two chunks are scheduled for the same device, alt_future gives a handle to the
+  // asynchronous computation associated with the other chunk. For the first chunk launched on
+  // a device, for_each_device sets it to an already-ready future.
+  //
+  // alt_future can be used to overlap memory transfer with kernel computation. For
+  // example, a functor to process chunks might look something like this
+  //    f(const chunk_context& ctx, index_range rng) noexcept -> xena::future<> {
+  //        ...
+  //        async_copy_memory(stream, ...);
+  //
+  //        // wait for the other future to finish so that we don't oversubscribe the GPU
+  //        co_await ctx.alt_future;
+  //
+  //        launch_kernel(stream, ...);
+  //        co_await synchronize_stream(stream);
+  //    }
+  xena::shared_future<> alt_future;
+};
+} // namespace sxt::xendv
diff --git a/sxt/execution/device/for_each.cc b/sxt/execution/device/for_each.cc
@@ -18,12 +18,37 @@
 
 #include "sxt/base/device/active_device_guard.h"
 #include "sxt/base/device/property.h"
+#include "sxt/base/device/state.h"
 #include "sxt/base/iterator/split.h"
 #include "sxt/execution/async/coroutine.h"
 #include "sxt/execution/async/future.h"
 #include "sxt/execution/device/available_device.h"
 
 namespace sxt::xendv {
+//--------------------------------------------------------------------------------------------------
+// for_each_device_impl
+//--------------------------------------------------------------------------------------------------
+// Launch the remaining chunks for a single device, keeping at most two of them in flight.
+// ctx and ctx_p are the device's two contexts; ctx_p->alt_future tracks the most recent launch.
+// chunk_index and iter are taken by reference and are shared with the loops of other devices,
+// so they must outlive this coroutine (for_each_device awaits every loop before returning).
+static xena::future<> for_each_device_impl(
+    chunk_context* ctx, chunk_context* ctx_p, unsigned& chunk_index,
+    basit::index_range_iterator& iter, basit::index_range_iterator last,
+    std::function<xena::future<>(const chunk_context& ctx, basit::index_range)> f) noexcept {
+  auto device_index = ctx->device_index;
+  while (iter != last) {
+    auto chunk = *iter++;
+    ctx_p->chunk_index = chunk_index++;
+    basdv::set_device(device_index);
+    auto fut = f(*ctx_p, chunk); // f sees the previous launch through ctx_p->alt_future
+    co_await ctx_p->alt_future;  // the previous chunk must finish before another is launched
+    ctx->alt_future = std::move(fut);
+    std::swap(ctx, ctx_p);
+  }
+  co_await ctx_p->alt_future; // no chunks left: wait for the last launch on this device
+}
+
 //--------------------------------------------------------------------------------------------------
 // concurrent_for_each
 //--------------------------------------------------------------------------------------------------
@@ -51,4 +76,53 @@ concurrent_for_each(basit::index_range rng,
   auto [first, last] = basit::split(rng, split_options);
   return concurrent_for_each(first, last, f);
 }
+
+//--------------------------------------------------------------------------------------------------
+// for_each_device
+//--------------------------------------------------------------------------------------------------
+xena::future<> for_each_device(
+    basit::index_range_iterator first, basit::index_range_iterator last,
+    std::function<xena::future<>(const chunk_context& ctx, basit::index_range)> f) noexcept {
+  if (first == last) {
+    co_return;
+  }
+
+  unsigned chunk_index = 0;
+  auto num_chunks = static_cast<unsigned>(std::distance(first, last));
+  auto num_devices = basdv::get_num_devices();
+  auto num_devices_used = static_cast<unsigned>(std::min(num_chunks, num_devices));
+  // NOTE(review): num_devices_used is 0 if no device is reported, so f would never run - confirm
+  basdv::active_device_guard guard; // presumably restores the active device on exit - confirm
+
+  // set up one context per device used; alt_future starts out ready as nothing is in flight yet
+  std::vector<chunk_context> contexts(num_devices_used);
+  for (unsigned device_index = 0; device_index < num_devices_used; ++device_index) {
+    auto& ctx = contexts[device_index];
+    ctx.device_index = device_index;
+    ctx.alt_future = xena::make_ready_future();
+    ctx.num_devices_used = num_devices_used;
+  }
+  std::vector<chunk_context> contexts_p(contexts);
+
+  // initial launches: start one chunk on each device and keep its future in contexts_p
+  for (unsigned device_index = 0; device_index < num_devices_used; ++device_index) {
+    auto& ctx = contexts[device_index];
+    ctx.chunk_index = chunk_index++;
+    auto chunk = *first++;
+    basdv::set_device(device_index);
+    contexts_p[device_index].alt_future = f(ctx, chunk);
+  }
+
+  // continue launching until all chunks are processed (the loops share chunk_index and first)
+  std::vector<xena::future<>> futs(num_devices_used);
+  for (unsigned device_index = 0; device_index < num_devices_used; ++device_index) {
+    futs[device_index] = for_each_device_impl(&contexts[device_index], &contexts_p[device_index],
+                                              chunk_index, first, last, f);
+  }
+
+  // wait for every per-device loop to finish
+  for (auto& fut : futs) {
+    co_await std::move(fut);
+  }
+}
 } // namespace sxt::xendv
diff --git a/sxt/execution/device/for_each.h b/sxt/execution/device/for_each.h
@@ -17,8 +17,12 @@
 #pragma once
 
 #include <functional>
+#include <optional>
 
-#include "sxt/execution/async/future_fwd.h"
+#include "sxt/base/device/stream.h"
+#include "sxt/execution/async/future.h"
+#include "sxt/execution/async/shared_future.h"
+#include "sxt/execution/device/chunk_context.h"
 
 namespace sxt::basit {
 class index_range;
@@ -44,4 +48,15 @@ concurrent_for_each(basit::index_range_iterator first, basit::index_range_iterat
 xena::future<>
 concurrent_for_each(basit::index_range rng,
                     std::function<xena::future<>(const basit::index_range&)> f) noexcept;
+
+//--------------------------------------------------------------------------------------------------
+// for_each_device
+//--------------------------------------------------------------------------------------------------
+/**
+ * Invoke f on each chunk in [first, last), splitting the work across available devices; at most
+ * two chunks are in flight per device. The returned future completes after every chunk finishes.
+ */
+xena::future<> for_each_device(
+    basit::index_range_iterator first, basit::index_range_iterator last,
+    std::function<xena::future<>(const chunk_context& ctx, basit::index_range)> f) noexcept;
 } // namespace sxt::xendv
diff --git a/sxt/execution/device/for_each.t.cc b/sxt/execution/device/for_each.t.cc
@@ -16,10 +16,15 @@
  */
 #include "sxt/execution/device/for_each.h"
 
+#include <algorithm>
+#include <numeric>
+#include <random>
 #include <utility>
 #include <vector>
 
+#include "sxt/base/error/assert.h"
 #include "sxt/base/iterator/index_range.h"
+#include "sxt/base/iterator/index_range_iterator.h"
 #include "sxt/base/test/unit_test.h"
 #include "sxt/execution/async/future.h"
 
@@ -67,3 +72,118 @@ TEST_CASE("we can concurrently invoke code on different GPUs") {
     REQUIRE(t == 11);
   }
 }
+
+TEST_CASE("we can manage asynchronous chunked computations") {
+  std::vector<std::pair<unsigned, unsigned>> ranges;
+  std::vector<xena::promise<int>> promises(10);
+
+  SECTION("we iterate over no chunks") {
+    basit::index_range_iterator iter{basit::index_range{2, 2}, 1};
+    auto fut = for_each_device(
+        iter, iter, [&](const chunk_context& ctx, basit::index_range rng) -> xena::future<> {
+          return xena::future<int>{promises[0]}.then([](int /*val*/) noexcept {});
+        });
+    REQUIRE(fut.ready());
+  }
+
+  SECTION("we can iterate over a single chunk") {
+    basit::index_range_iterator first{basit::index_range{0, 1}, 1};
+    basit::index_range_iterator last{basit::index_range{1, 1}, 1};
+    auto fut = for_each_device(
+        first, last, [&](const chunk_context& ctx, basit::index_range rng) -> xena::future<> {
+          ranges.emplace_back(rng.a(), rng.b());
+          return xena::future<int>{promises[0]}.then(
+              [&](int val) noexcept { SXT_RELEASE_ASSERT(val == 123); });
+        });
+    REQUIRE(!fut.ready());
+    promises[0].set_value(123);
+    REQUIRE(fut.ready());
+    std::vector<std::pair<unsigned, unsigned>> expected = {{0, 1}};
+    REQUIRE(ranges == expected);
+  }
+
+  SECTION("we can iterate over two chunks") {
+    basit::index_range_iterator first{basit::index_range{0, 2}, 1};
+    basit::index_range_iterator last{basit::index_range{2, 2}, 1};
+    auto fut = for_each_device(
+        first, last, [&](const chunk_context& ctx, basit::index_range rng) -> xena::future<> {
+          ranges.emplace_back(rng.a(), rng.b());
+          return xena::future<int>{promises[ctx.chunk_index]}.then(
+              [chunk_index = ctx.chunk_index](int val) noexcept {
+                if (chunk_index == 0) {
+                  SXT_RELEASE_ASSERT(val == 123);
+                } else {
+                  SXT_RELEASE_ASSERT(val == 456);
+                }
+              });
+        });
+    REQUIRE(!fut.ready());
+    promises[0].set_value(123);
+    REQUIRE(!fut.ready());
+    promises[1].set_value(456);
+    REQUIRE(fut.ready());
+    std::vector<std::pair<unsigned, unsigned>> expected = {{0, 1}, {1, 2}};
+    REQUIRE(ranges == expected);
+  }
+
+  SECTION("we can iterate over different chunk sizes") {
+    for (unsigned k = 3; k < 10; ++k) {
+      promises.clear();
+      ranges.clear();
+      promises.resize(k);
+      basit::index_range_iterator first{basit::index_range{0, k}, 1};
+      basit::index_range_iterator last{basit::index_range{k, k}, 1};
+      auto fut = for_each_device(
+          first, last, [&](const chunk_context& ctx, basit::index_range rng) -> xena::future<> {
+            ranges.emplace_back(rng.a(), rng.b());
+            return xena::future<int>{promises[ctx.chunk_index]}.then(
+                [chunk_index = ctx.chunk_index](int val) noexcept {
+                  SXT_RELEASE_ASSERT(val == chunk_index);
+                });
+          });
+      std::vector<std::pair<unsigned, unsigned>> expected;
+      for (unsigned i = 0; i < k; ++i) {
+        REQUIRE(!fut.ready());
+        promises[i].set_value(i);
+        expected.emplace_back(i, i + 1);
+      }
+      REQUIRE(fut.ready());
+      REQUIRE(ranges == expected);
+    }
+  }
+
+  SECTION("we can iterate over different chunks finished in an arbitrary order") {
+    // fixed seed so that the shuffled completion order is reproducible between runs
+    std::mt19937 rng{0};
+
+    for (unsigned k = 3; k < 10; ++k) {
+      promises.clear();
+      promises.resize(k);
+      std::vector<bool> finished(k);
+      std::vector<xena::future<int>> futs;
+      for (auto& ps : promises) {
+        futs.emplace_back(ps);
+      }
+      basit::index_range_iterator first{basit::index_range{0, k}, 1};
+      basit::index_range_iterator last{basit::index_range{k, k}, 1};
+      auto fut = for_each_device(
+          first, last, [&](const chunk_context& ctx, basit::index_range rng) -> xena::future<> {
+            return futs[ctx.chunk_index].then(
+                [&finished, chunk_index = ctx.chunk_index](int val) noexcept {
+                  finished[chunk_index] = true;
+                  SXT_RELEASE_ASSERT(val == chunk_index);
+                });
+          });
+      // fulfil the promises in a shuffled order; fut must stay pending until the last one
+      std::vector<unsigned> ix(k);
+      std::iota(ix.begin(), ix.end(), 0);
+      std::shuffle(ix.begin(), ix.end(), rng);
+      for (auto i : ix) {
+        REQUIRE(!fut.ready());
+        promises[i].set_value(i);
+      }
+      REQUIRE(fut.ready());
+      REQUIRE(std::count(finished.begin(), finished.end(), true) == k);
+    }
+  }
+}