@@ -2998,7 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
     GGML_ASSERT(unary_ops.size() == num_unary);
-#endif;
+#endif
 
     // TODO: remove special case once ggml_can_fuse can handle empty nodes
     std::initializer_list<enum ggml_op> topk_moe_ops =
@@ -3139,29 +3139,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
     return false;
 }
 
-static void reorder_nodes_for_stream_fusion(ggml_backend_cuda_context * cuda_ctx, ggml_tensor ** data, int n_nodes) {
-    for (const auto & [fork_node, event] : cuda_ctx->stream_context()) {
-
-        const int fork_node_idx = event.fork_node_idx;
-        const int join_node_idx = event.join_node_idx;
-
-        for (int i = fork_node_idx + 1, k = 0; i <= join_node_idx - 1; i++, k++) {
-            data[i] = const_cast<ggml_tensor *>(event.nodes[k]);
-        }
-    }
-    for (const auto & [fork_node, event] : cuda_ctx->stream_context()) {
-        bool found = false;
-        for (int i = 0; i < n_nodes; ++i) {
-            if (data[i] == fork_node) {
-                found = true;
-                break;
-            }
-        }
-
-        GGML_ASSERT(found);
-    }
-}
-
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
     // flag used to determine whether it is an integrated_gpu
@@ -3176,22 +3153,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     if (!use_cuda_graph || cuda_graph_update_required) {
         [[maybe_unused]] int prev_i = 0;
 
-        ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
-        GGML_LOG_DEBUG("Stream ctx size: %d\n", stream_ctx.size());
-        ggml_tensor ** orig_data = cgraph->nodes;
-        std::vector<ggml_tensor *> orig_graph;
-        orig_graph.resize(cgraph->n_nodes);
-        if (cuda_graph_update_required) {
-            // we are capturing so we can actually re-order
-            for (int i = 0; i < cgraph->n_nodes; ++i) {
-                orig_graph[i] = cgraph->nodes[i];
-            }
-            reorder_nodes_for_stream_fusion(cuda_ctx, orig_graph.data(), cgraph->n_nodes);
-            GGML_LOG_DEBUG("Reordered CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
-            cgraph->nodes = orig_graph.data();
-        }
-
-
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
             if (is_concurrent_event_active) {
@@ -3227,6 +3188,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 continue;
             }
 
+            ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
 
             // start of fusion operations
             static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
@@ -3535,33 +3497,31 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             GGML_ASSERT(ok);
 
             if (!is_concurrent_event_active) {
-                // const ggml_tensor * adjusted_node = node;
+                const ggml_tensor * adjusted_node = node;
                 // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD),
                 // we can safely use the previous node to check if it can be forked
-                for (int k = prev_i + 1; k < i; ++k) {
-                    const ggml_tensor * adjusted_node = cgraph->nodes[k];
-                    if (stream_ctx.find(adjusted_node) != stream_ctx.end()) {
-                        concurrent_event = &stream_ctx[adjusted_node];
+                if (i - prev_i > 1) {
+                    adjusted_node = cgraph->nodes[i - 1];
+                }
+                if (stream_ctx.find(adjusted_node) != stream_ctx.end()) {
+                    concurrent_event = &stream_ctx[adjusted_node];
 
-                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+                    GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
 
-                        cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
-                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
-                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
+                    cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
+                    GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
+                    CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
 
-                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
-                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
-                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
-                        }
-
-                        is_concurrent_event_active = true;
+                    for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                        cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
+                        CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
                     }
-
+
+                    is_concurrent_event_active = true;
                 }
-            }
+            }
             prev_i = i;
         }
-        cgraph->nodes = orig_data;
     }
 
 #ifdef USE_CUDA_GRAPH
@@ -3713,7 +3673,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     }
 
     GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p %d \n", cgraph->nodes, cgraph->n_nodes);
+    GGML_LOG_DEBUG("Optimizing CUDA graph\n");
 
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.clear();
@@ -3873,17 +3833,8 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             continue;
         }
 
-        for (const auto & branch: nodes_per_branch) {
-            for (const ggml_tensor * n: branch) {
-                concurrent_event.nodes.push_back(n);
-            }
-        }
-        concurrent_event.fork_node_idx = fork_node_idx;
-        concurrent_event.join_node_idx = join_node_idx;
-
         GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end());
         cuda_ctx->stream_context().emplace(root_node, concurrent_event);
-        GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
         concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
 
         // interleave tensors to extend lifetimes so that ggml graph doesn't recycle them
@@ -3899,7 +3850,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
                 has_node |= branch_node.size() > 0;
             }
 
-            GGML_ASSERT(has_node);
+            if (!has_node) {
+                printf("Skipping %s because it is empty %s\n", cgraph->nodes[current_node_idx]->name,
+                       ggml_op_name(cgraph->nodes[current_node_idx]->op));
+                current_node_idx++;
+                continue;
+            }
 
             if (branch_nodes.empty()) {
                 current_branch_idx = (current_branch_idx + 1) % n_branches;
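
For orientation, here is a minimal standalone sketch (not part of this diff or of llama.cpp) of the fork/join pattern that the `fork_event` logic above relies on: the main stream records an event, each worker stream waits on it before running its branch, and the main stream then waits on per-branch events before continuing. The worker-stream count, `branch_kernel`, and the explicit join events are illustrative assumptions.

```cpp
// fork/join across CUDA streams via events; compile with: nvcc -o fork_join fork_join.cu
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                           \
    do {                                                                           \
        cudaError_t err_ = (call);                                                 \
        if (err_ != cudaSuccess) {                                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                            \
                    cudaGetErrorString(err_), __FILE__, __LINE__);                 \
            exit(1);                                                               \
        }                                                                          \
    } while (0)

// dummy per-branch work (illustrative only)
__global__ void branch_kernel(float * data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] += 1.0f;
    }
}

int main() {
    const int n_streams = 2;   // analogous to concurrent_event->n_streams
    const int n = 1 << 20;

    float *      bufs[n_streams];
    cudaStream_t main_stream, workers[n_streams];
    cudaEvent_t  fork_event, join_events[n_streams];

    CUDA_CHECK(cudaStreamCreate(&main_stream));
    CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamCreate(&workers[s]));
        CUDA_CHECK(cudaEventCreateWithFlags(&join_events[s], cudaEventDisableTiming));
        CUDA_CHECK(cudaMalloc(&bufs[s], n * sizeof(float)));
    }

    // fork: work queued on the workers is ordered after the main stream's prior work
    CUDA_CHECK(cudaEventRecord(fork_event, main_stream));
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamWaitEvent(workers[s], fork_event, 0));
        CUDA_CHECK(cudaMemsetAsync(bufs[s], 0, n * sizeof(float), workers[s]));
        branch_kernel<<<(n + 255) / 256, 256, 0, workers[s]>>>(bufs[s], n);
        CUDA_CHECK(cudaEventRecord(join_events[s], workers[s]));
    }

    // join: the main stream resumes only after every branch has finished
    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaStreamWaitEvent(main_stream, join_events[s], 0));
    }
    CUDA_CHECK(cudaStreamSynchronize(main_stream));
    printf("fork/join on %d worker streams completed\n", n_streams);

    for (int s = 0; s < n_streams; ++s) {
        CUDA_CHECK(cudaFree(bufs[s]));
        CUDA_CHECK(cudaStreamDestroy(workers[s]));
        CUDA_CHECK(cudaEventDestroy(join_events[s]));
    }
    CUDA_CHECK(cudaStreamDestroy(main_stream));
    CUDA_CHECK(cudaEventDestroy(fork_event));
    return 0;
}
```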