properly sync to stream

agray3 · agray3 · commit 6d7df91962e3 · 2025-03-31T08:19:34.000-07:00
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
@@ -347,15 +347,16 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
-void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size) {
+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
     if(cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         if (cuda_graph->dest_ptrs_d != nullptr) cudaFree(cuda_graph->dest_ptrs_d);
+        cudaStreamSynchronize(stream);
         cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *));
         cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
     }
     // copy destination pointers to GPU
-    cudaMemcpy(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice);
+    cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream);
     cuda_graph->graph_cpynode_index = 0; // reset index
 #endif
 }
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
@@ -8,4 +8,4 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
 
-void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size);
+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2486,7 +2486,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     if(use_cuda_graph)
     {
         // copy pointers to GPU so they can be accessed via indirection within CUDA graph
-        ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size());
+        ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
     }
 
     return use_cuda_graph;

Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,4 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);`
`8`	`8`
`9`	`9`	`void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);`
`10`	`10`
`11`		`-void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size);`
	`11`	`+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);`
Original file line number	Diff line number	Diff line change
`@@ -2486,7 +2486,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud`
`2486`	`2486`	`if(use_cuda_graph)`
`2487`	`2487`	`{`
`2488`	`2488`	`// copy pointers to GPU so they can be accessed via indirection within CUDA graph`
`2489`		`- ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size());`
	`2489`	`+ ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());`
`2490`	`2490`	`}`
`2491`	`2491`
`2492`	`2492`	`return use_cuda_graph;`