removing shuffler_nb from header

nirandaperera · nirandaperera · commit 141b711a564f · 2025-09-04T12:25:29.000-07:00
Signed-off-by: niranda perera <niranda.perera@gmail.com>
diff --git a/cpp/include/rapidsmpf/streaming/cudf/shuffler.hpp b/cpp/include/rapidsmpf/streaming/cudf/shuffler.hpp
@@ -47,35 +47,4 @@ Node shuffler(
     shuffler::Shuffler::PartitionOwner partition_owner = shuffler::Shuffler::round_robin
 );
 
-/**
- * @brief Launches a non-blocking shuffler node for a single shuffle operation.
- *
- * This is a non-blocking version of `shuffler` that returns a pair of nodes. The first
- * node inserts the partition map chunks into the shuffler and the second node extracts
- * the packed chunks from the shuffler and sends them to the output channel.
- *
- * @param ctx The streaming context providing communication, memory, stream, and execution
- * resources.
- * @param stream The CUDA stream on which to perform the shuffling. If chunks from the
- * input channel aren't created on `stream`, the streams are all synchronized.
- * @param ch_in Input channel providing packed partition chunks to be shuffled.
- * @param ch_out Output channel where the shuffled results are sent.
- * @param op_id Unique operation ID for this shuffle. Must not be reused until all nodes
- * have called `Shuffler::shutdown()`.
- * @param total_num_partitions Total number of partitions to shuffle the data into.
- * @param partition_owner Function that maps a partition ID to its owning rank/node.
- *
- * @return A pair of nodes that complete when the shuffling has finished and the output
- * channel is drained.
- */
-std::pair<Node, Node> shuffler_nb(
-    std::shared_ptr<Context> ctx,
-    rmm::cuda_stream_view stream,
-    std::shared_ptr<Channel> ch_in,
-    std::shared_ptr<Channel> ch_out,
-    OpID op_id,
-    shuffler::PartID total_num_partitions,
-    shuffler::Shuffler::PartitionOwner partition_owner = shuffler::Shuffler::round_robin
-);
-
 }  // namespace rapidsmpf::streaming::node
diff --git a/cpp/src/streaming/cudf/shuffler.cpp b/cpp/src/streaming/cudf/shuffler.cpp
@@ -99,113 +99,4 @@ Node shuffler(
     co_await ch_out->drain(ctx->executor());
 }
 
-std::pair<Node, Node> shuffler_nb(
-    std::shared_ptr<Context> ctx,
-    rmm::cuda_stream_view stream,
-    std::shared_ptr<Channel> ch_in,
-    std::shared_ptr<Channel> ch_out,
-    OpID op_id,
-    shuffler::PartID total_num_partitions,
-    shuffler::Shuffler::PartitionOwner partition_owner
-) {
-    // make a shared_ptr to the shuffler so that it can be passed into multiple coroutines
-    auto shuffler = std::make_shared<rapidsmpf::shuffler::Shuffler>(
-        ctx->comm(),
-        ctx->progress_thread(),
-        op_id,
-        total_num_partitions,
-        stream,
-        ctx->br(),
-        ctx->statistics(),
-        std::move(partition_owner)
-    );
-
-    // insert task: insert the partition map chunks into the shuffler
-    auto insert_task =
-        [](
-            auto shuffler, auto ctx, auto total_num_partitions, auto stream, auto ch_in
-        ) -> Node {
-        ShutdownAtExit c{ch_in};
-        co_await ctx->executor()->schedule();
-        CudaEvent event;
-
-        while (true) {
-            auto msg = co_await ch_in->receive();
-            if (msg.empty()) {
-                break;
-            }
-            auto partition_map = msg.template release<PartitionMapChunk>();
-
-            // Make sure that the input chunk's stream is in sync with shuffler's stream.
-            sync_streams(stream, partition_map.stream, event);
-
-            shuffler->insert(std::move(partition_map.data));
-        }
-
-        // Tell the shuffler that we have no more input data.
-        std::vector<rapidsmpf::shuffler::PartID> finished(total_num_partitions);
-        std::iota(finished.begin(), finished.end(), 0);
-        shuffler->insert_finished(std::move(finished));
-        co_return;
-    };
-
-    // extract task: extract the packed chunks from the shuffler and send them to the
-    // output channel
-    auto extract_task = [](auto shuffler, auto ctx, auto ch_out) -> Node {
-        ShutdownAtExit c{ch_out};
-        co_await ctx->executor()->schedule();
-
-        coro::mutex mtx{};
-        coro::condition_variable cv{};
-        bool finished{false};
-
-        shuffler->register_finished_callback(
-            [shuffler, ctx, ch_out, &mtx, &cv, &finished](auto pid) {
-                // task to extract and send each finished partition
-                auto extract_and_send = [](auto shuffler,
-                                           auto ctx,
-                                           auto ch_out,
-                                           auto pid,
-                                           coro::condition_variable& cv,
-                                           coro::mutex& mtx,
-                                           bool& finished) -> Node {
-                    co_await ctx->executor()->schedule();
-                    auto packed_chunks = shuffler->extract(pid);
-                    co_await ch_out->send(
-                        std::make_unique<PartitionVectorChunk>(
-                            pid, std::move(packed_chunks)
-                        )
-                    );
-
-                    // signal that all partitions have been finished
-                    if (shuffler->finished()) {
-                        {
-                            auto lock = co_await mtx.scoped_lock();
-                            finished = true;
-                        }
-                        co_await cv.notify_one();
-                    }
-                };
-                // schedule a detached task to extract and send the packed chunks
-                ctx->executor()->spawn(
-                    extract_and_send(shuffler, ctx, ch_out, pid, cv, mtx, finished)
-                );
-            }
-        );
-
-        // wait for all partitions to be finished
-        {
-            auto lock = co_await mtx.scoped_lock();
-            co_await cv.wait(lock, [&finished]() { return finished; });
-        }
-
-        co_await ch_out->drain(ctx->executor());
-    };
-
-    return {
-        insert_task(shuffler, ctx, total_num_partitions, stream, std::move(ch_in)),
-        extract_task(std::move(shuffler), std::move(ctx), std::move(ch_out))
-    };
-}
-
 }  // namespace rapidsmpf::streaming::node
diff --git a/cpp/tests/streaming/test_shuffler.cpp b/cpp/tests/streaming/test_shuffler.cpp
@@ -98,31 +98,159 @@ class StreamingShuffler : public BaseStreamingFixture {
 };
 
 TEST_F(StreamingShuffler, Basic) {
-    run_test([&](auto ctx, auto ch_in, auto ch_out, std::vector<Node>& nodes) {
-        nodes.emplace_back(
-            node::shuffler(
+    EXPECT_NO_FATAL_FAILURE(
+        run_test([&](auto ctx, auto ch_in, auto ch_out, std::vector<Node>& nodes) {
+            nodes.emplace_back(
+                node::shuffler(
+                    std::move(ctx),
+                    stream,
+                    std::move(ch_in),
+                    std::move(ch_out),
+                    op_id,
+                    num_partitions
+                )
+            );
+        })
+    );
+}
+
+namespace {
+
+void sync_streams(
+    rmm::cuda_stream_view primary,
+    rmm::cuda_stream_view secondary,
+    cudaEvent_t const& event
+) {
+    if (primary.value() != secondary.value()) {
+        RAPIDSMPF_CUDA_TRY(cudaEventRecord(event, secondary));
+        RAPIDSMPF_CUDA_TRY(cudaStreamWaitEvent(primary, event));
+    }
+}
+
+// emulate shuffler node with callbacks
+std::pair<Node, Node> shuffler_nb(
+    std::shared_ptr<Context> ctx,
+    rmm::cuda_stream_view stream,
+    std::shared_ptr<Channel> ch_in,
+    std::shared_ptr<Channel> ch_out,
+    OpID op_id,
+    shuffler::PartID total_num_partitions
+) {
+    // make a shared_ptr to the shuffler so that it can be passed into multiple coroutines
+    auto shuffler = std::make_shared<rapidsmpf::shuffler::Shuffler>(
+        ctx->comm(),
+        ctx->progress_thread(),
+        op_id,
+        total_num_partitions,
+        stream,
+        ctx->br(),
+        ctx->statistics(),
+        shuffler::Shuffler::round_robin
+    );
+
+    // insert task: insert the partition map chunks into the shuffler
+    auto insert_task =
+        [](
+            auto shuffler, auto ctx, auto total_num_partitions, auto stream, auto ch_in
+        ) -> Node {
+        ShutdownAtExit c{ch_in};
+        co_await ctx->executor()->schedule();
+        CudaEvent event;
+
+        while (true) {
+            auto msg = co_await ch_in->receive();
+            if (msg.empty()) {
+                break;
+            }
+            auto partition_map = msg.template release<PartitionMapChunk>();
+
+            // Make sure that the input chunk's stream is in sync with shuffler's stream.
+            sync_streams(stream, partition_map.stream, event);
+
+            shuffler->insert(std::move(partition_map.data));
+        }
+
+        // Tell the shuffler that we have no more input data.
+        std::vector<rapidsmpf::shuffler::PartID> finished(total_num_partitions);
+        std::iota(finished.begin(), finished.end(), 0);
+        shuffler->insert_finished(std::move(finished));
+        co_return;
+    };
+
+    // extract task: extract the packed chunks from the shuffler and send them to the
+    // output channel
+    auto extract_task = [](auto shuffler, auto ctx, auto ch_out) -> Node {
+        ShutdownAtExit c{ch_out};
+        co_await ctx->executor()->schedule();
+
+        coro::mutex mtx{};
+        coro::condition_variable cv{};
+        bool finished{false};
+
+        shuffler->register_finished_callback(
+            [shuffler, ctx, ch_out, &mtx, &cv, &finished](auto pid) {
+                // task to extract and send each finished partition
+                auto extract_and_send = [](auto shuffler,
+                                           auto ctx,
+                                           auto ch_out,
+                                           auto pid,
+                                           coro::condition_variable& cv,
+                                           coro::mutex& mtx,
+                                           bool& finished) -> Node {
+                    co_await ctx->executor()->schedule();
+                    auto packed_chunks = shuffler->extract(pid);
+                    co_await ch_out->send(
+                        std::make_unique<PartitionVectorChunk>(
+                            pid, std::move(packed_chunks)
+                        )
+                    );
+
+                    // signal that all partitions have been finished
+                    if (shuffler->finished()) {
+                        {
+                            auto lock = co_await mtx.scoped_lock();
+                            finished = true;
+                        }
+                        co_await cv.notify_one();
+                    }
+                };
+                // schedule a detached task to extract and send the packed chunks
+                ctx->executor()->spawn(
+                    extract_and_send(shuffler, ctx, ch_out, pid, cv, mtx, finished)
+                );
+            }
+        );
+
+        // wait for all partitions to be finished
+        {
+            auto lock = co_await mtx.scoped_lock();
+            co_await cv.wait(lock, [&finished]() { return finished; });
+        }
+
+        co_await ch_out->drain(ctx->executor());
+    };
+
+    return {
+        insert_task(shuffler, ctx, total_num_partitions, stream, std::move(ch_in)),
+        extract_task(std::move(shuffler), std::move(ctx), std::move(ch_out))
+    };
+}
+
+}  // namespace
+
+TEST_F(StreamingShuffler, callbacks) {
+    EXPECT_NO_FATAL_FAILURE(
+        run_test([&](auto ctx, auto ch_in, auto ch_out, std::vector<Node>& nodes) {
+            auto [insert_node, extract_node] = shuffler_nb(
                 std::move(ctx),
                 stream,
                 std::move(ch_in),
                 std::move(ch_out),
                 op_id,
                 num_partitions
-            )
-        );
-    });
-}
-
-TEST_F(StreamingShuffler, callbacks) {
-    run_test([&](auto ctx, auto ch_in, auto ch_out, std::vector<Node>& nodes) {
-        auto [insert_node, extract_node] = node::shuffler_nb(
-            std::move(ctx),
-            stream,
-            std::move(ch_in),
-            std::move(ch_out),
-            op_id,
-            num_partitions
-        );
-        nodes.emplace_back(std::move(insert_node));
-        nodes.emplace_back(std::move(extract_node));
-    });
+            );
+            nodes.emplace_back(std::move(insert_node));
+            nodes.emplace_back(std::move(extract_node));
+        })
+    );
 }