@@ -2998,7 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
     GGML_ASSERT(unary_ops.size() == num_unary);
-#endif
+#endif;
 
     // TODO: remove special case once ggml_can_fuse can handle empty nodes
     std::initializer_list<enum ggml_op> topk_moe_ops =
@@ -3139,6 +3139,29 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     return false;
 }
 
+static void reorder_nodes_for_stream_fusion(ggml_backend_cuda_context * cuda_ctx, ggml_tensor ** data, int n_nodes) {
+    for (const auto & [fork_node, event] : cuda_ctx->stream_context()) {
+
+        const int fork_node_idx = event.fork_node_idx;
+        const int join_node_idx = event.join_node_idx;
+
+        for (int i = fork_node_idx + 1, k = 0; i <= join_node_idx - 1; i++, k++) {
+            data[i] = const_cast<ggml_tensor *>(event.nodes[k]);
+        }
+    }
+    for (const auto & [fork_node, event] : cuda_ctx->stream_context()) {
+        bool found = false;
+        for (int i = 0; i < n_nodes; ++i) {
+            if (data[i] == fork_node) {
+                found = true;
+                break;
+            }
+        }
+
+        GGML_ASSERT(found);
+    }
+}
+
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
     // flag used to determine whether it is an integrated_gpu
@@ -3153,6 +3176,22 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     if (!use_cuda_graph || cuda_graph_update_required) {
         [[maybe_unused]] int prev_i = 0;
 
+        ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
+        GGML_LOG_DEBUG("Stream ctx size: %d\n", stream_ctx.size());
+        ggml_tensor ** orig_data = cgraph->nodes;
+        std::vector<ggml_tensor *> orig_graph;
+        orig_graph.resize(cgraph->n_nodes);
+        if (cuda_graph_update_required) {
+            // we are capturing so we can actually re-order
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                orig_graph[i] = cgraph->nodes[i];
+            }
+            reorder_nodes_for_stream_fusion(cuda_ctx, orig_graph.data(), cgraph->n_nodes);
+            GGML_LOG_DEBUG("Reordered CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
+            cgraph->nodes = orig_graph.data();
+        }
+
+
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
             if (is_concurrent_event_active) {
@@ -3188,7 +3227,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 continue;
             }
 
-            ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
 
             // start of fusion operations
             static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
@@ -3497,31 +3535,33 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             GGML_ASSERT(ok);
 
             if (!is_concurrent_event_active) {
-                const ggml_tensor * adjusted_node = node;
+                // const ggml_tensor * adjusted_node = node;
                 // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD),
                 // we can safely use the previous node to check if it can be forked
-                if (i - prev_i > 1) {
-                    adjusted_node = cgraph->nodes[i - 1];
-                }
-                if (stream_ctx.find(adjusted_node) != stream_ctx.end()) {
-                    concurrent_event = &stream_ctx[adjusted_node];
+                for (int k = prev_i + 1; k < i; ++k) {
+                    const ggml_tensor * adjusted_node = cgraph->nodes[k];
+                    if (stream_ctx.find(adjusted_node) != stream_ctx.end()) {
+                        concurrent_event = &stream_ctx[adjusted_node];
 
-                    GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
 
-                    cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
-                    GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
-                    CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
+                        cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
+                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
+                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
 
-                    for (int i = 1; i <= concurrent_event->n_streams; ++i) {
-                        cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
-                        CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
-                    }
+                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
+                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
+                        }
 
-                    is_concurrent_event_active = true;
+                        is_concurrent_event_active = true;
+                    }
+
                 }
-            }
+            }
             prev_i = i;
         }
+        cgraph->nodes = orig_data;
     }
 
 #ifdef USE_CUDA_GRAPH
@@ -3673,7 +3713,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     }
 
     GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU");
-    GGML_LOG_DEBUG("Optimizing CUDA graph\n");
+    GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
 
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.clear();
@@ -3833,8 +3873,17 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             continue;
         }
 
+        for (const auto & branch : nodes_per_branch) {
+            for (const ggml_tensor * n : branch) {
+                concurrent_event.nodes.push_back(n);
+            }
+        }
+        concurrent_event.fork_node_idx = fork_node_idx;
+        concurrent_event.join_node_idx = join_node_idx;
+
         GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end());
         cuda_ctx->stream_context().emplace(root_node, concurrent_event);
+        GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
         concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
 
         // interleave tensors to extend lifetimes so that ggml graph doesn't recycle them
@@ -3850,12 +3899,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             has_node |= branch_node.size() > 0;
         }
 
-        if (!has_node) {
-            printf("Skipping %s because it is empty %s\n", cgraph->nodes[current_node_idx]->name,
-                   ggml_op_name(cgraph->nodes[current_node_idx]->op));
-            current_node_idx++;
-            continue;
-        }
+        GGML_ASSERT(has_node);
 
         if (branch_nodes.empty()) {
             current_branch_idx = (current_branch_idx + 1) % n_branches;
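
For context on the hunks above: at the fork node the main stream records `fork_event` and each extra stream waits on it before running its branch; the corresponding join (not shown in this excerpt) has the main stream wait for the branches before continuing. Below is a minimal, self-contained sketch of that fork/join pattern using only the plain CUDA runtime API. It is not part of the patch; the kernel, buffer names, and the explicit per-branch join events are illustrative assumptions, not ggml code.

// fork_join_sketch.cu -- standalone illustration of fork/join stream concurrency
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));    \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// stand-in for one branch of independent work (hypothetical kernel)
__global__ void branch_kernel(float * dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] += 1.0f;
    }
}

int main() {
    const int n_streams = 2;        // number of concurrent branches (illustrative)
    const int n         = 1 << 20;  // elements per branch

    cudaStream_t main_stream;
    CUDA_CHECK(cudaStreamCreate(&main_stream));

    std::vector<cudaStream_t> streams(n_streams);
    std::vector<cudaEvent_t>  join_events(n_streams);
    cudaEvent_t fork_event;
    CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamCreate(&streams[s]));
        CUDA_CHECK(cudaEventCreateWithFlags(&join_events[s], cudaEventDisableTiming));
    }

    float * data;
    CUDA_CHECK(cudaMalloc(&data, (size_t) n_streams * n * sizeof(float)));
    CUDA_CHECK(cudaMemset(data, 0, (size_t) n_streams * n * sizeof(float)));

    // fork: every worker stream waits until the main stream reaches this point
    CUDA_CHECK(cudaEventRecord(fork_event, main_stream));
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamWaitEvent(streams[s], fork_event, 0));
        branch_kernel<<<(n + 255) / 256, 256, 0, streams[s]>>>(data + (size_t) s * n, n);
        CUDA_CHECK(cudaEventRecord(join_events[s], streams[s]));
    }

    // join: the main stream waits for every branch before continuing
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamWaitEvent(main_stream, join_events[s], 0));
    }
    CUDA_CHECK(cudaStreamSynchronize(main_stream));
    printf("all %d branches joined\n", n_streams);

    CUDA_CHECK(cudaFree(data));
    return 0;
}

The point of recording a single fork event rather than synchronizing the whole device is that only the ordering between the main stream and the worker streams is enforced; unrelated work already queued elsewhere is left untouched, which is what makes the reordering in the patch safe to capture into a CUDA graph.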