ggml: change cpy_tensor_async (cuda/cann) to run on the dst stream

koush · koush · commit daa3f79529b4 · 2025-05-27T15:17:31.000-07:00
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
@@ -1383,23 +1383,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
-                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
-                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                ggml_backend_synchronize(input_backend);
                 if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
-                    ggml_backend_synchronize(input_backend);
                     if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                         ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                     } else {
                         ggml_backend_synchronize(split_backend);
                     }
                     ggml_backend_tensor_copy(input, input_cpy);
                 }
-                else {
-                    if (input_backend->iface.synchronize) {
-                        // async copy succeeded, need to synchronize the input backend to ensure the copy is done before the split backend uses it
-                        input_backend->iface.synchronize(input_backend);
-                    }
-                }
             }
         }
 
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1913,10 +1913,7 @@ static bool ggml_backend_cann_cpy_tensor_async(
         cann_ctx_src->task_queue.wait();
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
                                    ACL_MEMCPY_DEVICE_TO_DEVICE,
-                                   cann_ctx_src->stream()));
-
-        //TODO: workaround for Event didn`t work here.
-        aclrtSynchronizeStream(cann_ctx_src->stream());
+                                   cann_ctx_dst->stream()));
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2460,7 +2460,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
 #else
-            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+            CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_dst->stream()));
 #endif
         }
 
@@ -2474,7 +2474,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
         }
     } else {
         // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
     }
     return true;
 }

Original file line number	Diff line number	Diff line change
`@@ -1383,23 +1383,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s`
`1383`	`1383`	`} else {`
`1384`	`1384`	`ggml_backend_synchronize(split_backend);`
`1385`	`1385`	`}`
`1386`		`- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events`
`1387`		`- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface`
	`1386`	`+ ggml_backend_synchronize(input_backend);`
`1388`	`1387`	`if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {`
`1389`		`- ggml_backend_synchronize(input_backend);`
`1390`	`1388`	`if (sched->events[split_backend_id][sched->cur_copy] != NULL) {`
`1391`	`1389`	`ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);`
`1392`	`1390`	`} else {`
`1393`	`1391`	`ggml_backend_synchronize(split_backend);`
`1394`	`1392`	`}`
`1395`	`1393`	`ggml_backend_tensor_copy(input, input_cpy);`
`1396`	`1394`	`}`
`1397`		`- else {`
`1398`		`- if (input_backend->iface.synchronize) {`
`1399`		`- // async copy succeeded, need to synchronize the input backend to ensure the copy is done before the split backend uses it`
`1400`		`- input_backend->iface.synchronize(input_backend);`
`1401`		`- }`
`1402`		`- }`
`1403`	`1395`	`}`
`1404`	`1396`	`}`
`1405`	`1397`
Original file line number	Diff line number	Diff line change
`@@ -2460,7 +2460,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_`
`2460`	`2460`	`#ifdef GGML_CUDA_NO_PEER_COPY`
`2461`	`2461`	`return false;`
`2462`	`2462`	`#else`
`2463`		`- CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));`
	`2463`	`+ CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_dst->stream()));`
`2464`	`2464`	`#endif`
`2465`	`2465`	`}`
`2466`	`2466`
`@@ -2474,7 +2474,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_`
`2474`	`2474`	`}`
`2475`	`2475`	`} else {`
`2476`	`2476`	`// src and dst are on the same backend`
`2477`		`- CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));`
	`2477`	`+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));`
`2478`	`2478`	`}`
`2479`	`2479`	`return true;`
`2480`	`2480`	`}`