Commit 2c3cfa9

fix: rms norm fusion causes delays in launching the mul-mat

1 parent: a5afc0c
File tree

1 file changed: +17 −10 lines

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 17 additions & 10 deletions
@@ -3173,12 +3173,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 GGML_ASSERT(concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end());
                 const int stream_mapping = concurrent_event->stream_mapping[node];
                 cuda_ctx->curr_stream_no = stream_mapping;
+                GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
             }
         }
-
-#ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         prev_i = i;
+
+#ifdef GGML_CUDA_DEBUG
         if (nodes_fused > 0) {
             GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
         }
@@ -3459,16 +3460,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
+        //TODO: fix this
+        static const bool graph_opt = (getenv("GGML_CUDA_GRAPH_OPT") != nullptr) && atoi(getenv("GGML_CUDA_GRAPH_OPT")) == 1;
+
         if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
-            ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
-            i += 2;
-            continue;
+            if (strncmp(cgraph->nodes[i+2]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
+                ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
+                i += 2;
+                continue;
+            }
         }
 
         if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
-            ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-            i++;
-            continue;
+            if (strncmp(cgraph->nodes[i+1]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
+                ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
+                i++;
+                continue;
+            }
         }
 
         if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
@@ -3506,7 +3514,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         if (stream_ctx.find(adjusted_node) != stream_ctx.end()) {
             concurrent_event = &stream_ctx[adjusted_node];
 
-            GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+            GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, adjusted_node->name);
 
             cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
             GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
@@ -3520,7 +3528,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 is_concurrent_event_active = true;
            }
        }
-        prev_i = i;
    }
 }
 
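Below is a minimal, standalone sketch of the gating pattern this commit introduces: a static flag cached from the GGML_CUDA_GRAPH_OPT environment variable combined with an "attn_norm" name-prefix check, which together decide whether the RMS-norm fusion runs. The fake_node struct, the should_fuse_rms_norm helper, and the example node names are hypothetical stand-ins for illustration only; the real code checks cgraph->nodes[i+1]/[i+2] inside evaluate_and_capture_cuda_graph.

// Standalone illustration (hypothetical helper, not part of ggml):
// decide whether the RMS-norm fusion should run for a given node name.
#include <cstdio>
#include <cstdlib>
#include <cstring>

struct fake_node {            // stand-in for ggml_tensor in this sketch
    const char * name;
};

static bool should_fuse_rms_norm(const fake_node * node) {
    // Cached once per process, mirroring the diff: the optimization path
    // is only taken when GGML_CUDA_GRAPH_OPT is set to 1.
    static const bool graph_opt =
        std::getenv("GGML_CUDA_GRAPH_OPT") != nullptr &&
        std::atoi(std::getenv("GGML_CUDA_GRAPH_OPT")) == 1;

    // Skip the fusion only for nodes whose name starts with "attn_norm"
    // while the graph optimization is enabled; fuse in every other case.
    const bool is_attn_norm =
        std::strncmp(node->name, "attn_norm", std::strlen("attn_norm")) == 0;

    return !(is_attn_norm && graph_opt);
}

int main() {
    const fake_node attn = { "attn_norm-12" };
    const fake_node ffn  = { "ffn_norm-12" };
    std::printf("attn_norm fused: %d\n", should_fuse_rms_norm(&attn));
    std::printf("ffn_norm  fused: %d\n", should_fuse_rms_norm(&ffn));
}

Run with GGML_CUDA_GRAPH_OPT=1 in the environment and the attn_norm case reports 0 (fusion skipped); with the variable unset or set to any other value, both cases report 1. This is the same condition the diff expresses as strncmp(...) != 0 || !graph_opt.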
