ggml-cuda: fix bug w.r.t. the first stream launch

am17an · am17an · commit 8729fcefdea2 · 2025-11-13T16:23:25.000+08:00
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3153,6 +3153,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             [[maybe_unused]] int prev_i = 0;
 
             ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
+
             if (stream_ctx.concurrent_events.size() > 0) {
                 cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());
             }
@@ -3186,7 +3187,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) {
                         concurrent_event = &stream_ctx.concurrent_events[prev_node];
 
-                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name);
 
                         cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
                         GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
@@ -3198,6 +3199,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         }
 
                         is_concurrent_event_active = true;
+                        cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
                     }
                 }
                 prev_i = i;

Original file line number	Diff line number	Diff line change
`@@ -3153,6 +3153,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx`
`3153`	`3153`	`[[maybe_unused]] int prev_i = 0;`
`3154`	`3154`
`3155`	`3155`	`ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();`
	`3156`	`+`
`3156`	`3157`	`if (stream_ctx.concurrent_events.size() > 0) {`
`3157`	`3158`	`cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());`
`3158`	`3159`	`}`
`@@ -3186,7 +3187,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx`
`3186`	`3187`	`if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) {`
`3187`	`3188`	`concurrent_event = &stream_ctx.concurrent_events[prev_node];`
`3188`	`3189`
`3189`		`- GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);`
	`3190`	`+ GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name);`
`3190`	`3191`
`3191`	`3192`	`cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0`
`3192`	`3193`	`GGML_ASSERT(cuda_ctx->curr_stream_no == 0);`
`@@ -3198,6 +3199,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx`
`3198`	`3199`	`}`
`3199`	`3200`
`3200`	`3201`	`is_concurrent_event_active = true;`
	`3202`	`+ cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];`
	`3203`	`+ GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);`
`3201`	`3204`	`}`
`3202`	`3205`	`}`
`3203`	`3206`	`prev_i = i;`