Skip to content

Commit 5225eaa

Browse files
committed
ggml: improve ggml_backend_cuda_cpy_tensor_async
Make device-to-device copies actually async; previously the implementation synchronized on the dst stream. Also implement async host-to-device copies.
1 parent cdf94a1 commit 5225eaa

File tree

2 files changed

+44
-16
lines changed

2 files changed

+44
-16
lines changed

ggml/src/ggml-backend.cpp

Lines changed: 6 additions & 0 deletions
@@ -1394,6 +1394,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             }
+            else {
+                if (input_backend->iface.synchronize) {
+                    // async copy succeeded, need to synchronize the input backend to ensure the copy is done before the split backend uses it
+                    input_backend->iface.synchronize(input_backend);
+                }
+            }
         }
     }
 

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2404,22 +2404,42 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
     ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
     ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
 
-    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
-        return false;
+    bool src_is_cuda = ggml_backend_is_cuda(backend_src);
+    if (src_is_cuda) {
+        if (!ggml_backend_buffer_is_cuda(buf_src)) {
+            return false;
+        }
+    }
+
+    bool dst_is_cuda = ggml_backend_is_cuda(backend_dst);
+    if (dst_is_cuda) {
+        if (!ggml_backend_buffer_is_cuda(buf_dst)) {
+            return false;
+        }
     }
 
-    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+    bool src_is_host = !src_is_cuda && ggml_backend_buffer_is_host(buf_src);
+    bool dst_is_host = !dst_is_cuda && ggml_backend_buffer_is_host(buf_dst);
+
+    // async copy supports cuda to cuda, cuda to host, and host to cuda.
+    if (!src_is_cuda && !dst_is_cuda) {
+        return false;
+    }
+    else if (src_is_host && !dst_is_cuda) {
+        return false;
+    }
+    else if (dst_is_host && !src_is_cuda) {
         return false;
     }
 
     // device -> device copy
-    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
-    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
+    ggml_backend_cuda_context * cuda_ctx_src = src_is_cuda ? (ggml_backend_cuda_context *)backend_src->context : nullptr;
+    ggml_backend_cuda_context * cuda_ctx_dst = dst_is_cuda ? (ggml_backend_cuda_context *)backend_dst->context : nullptr;
 
     ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
     ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
 
-    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+    if ((cuda_ctx_src && cuda_ctx_src->device != buf_ctx_src->device) || (cuda_ctx_dst && cuda_ctx_dst->device != buf_ctx_dst->device)) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
@@ -2428,7 +2448,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 
     if (backend_src != backend_dst) {
         // copy on src stream
-        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
+        if (src_is_host) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
+        } else if (dst_is_host) {
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToHost, cuda_ctx_src->stream()));
+        } else if (cuda_ctx_src->device == cuda_ctx_dst->device) {
             CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
         } else {
 #ifdef GGML_CUDA_NO_PEER_COPY
@@ -2438,16 +2462,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 #endif
         }
 
-        // record event on src stream after the copy
-        if (!cuda_ctx_src->copy_event) {
-            ggml_cuda_set_device(cuda_ctx_src->device);
-            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
-        }
-
-        CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
+        if (cuda_ctx_src) {
+            if (!cuda_ctx_src->copy_event) {
+                ggml_cuda_set_device(cuda_ctx_src->device);
+                CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+            }
 
-        // wait on dst stream for the copy to complete
-        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
+            CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
+        }
     } else {
         // src and dst are on the same backend
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));

0 commit comments

Comments (0)