Skip to content

Commit ae46c63

Browse files
committed
[WIP][CUDA backend]: Async copy between host<->device
1 parent 33ec615 commit ae46c63

File tree

3 files changed

+143
-6
lines changed

3 files changed

+143
-6
lines changed

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ class ET_EXPERIMENTAL CudaBackend final
200200
DelegateHandle* handle_,
201201
Span<EValue*> args) const override {
202202
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
203+
cudaStream_t stream = static_cast<cudaStream_t>(handle->cuda_stream);
203204

204205
size_t n_inputs;
205206
handle->get_num_inputs(handle->container_handle, &n_inputs);
@@ -251,11 +252,11 @@ class ET_EXPERIMENTAL CudaBackend final
251252

252253
gpu_inputs[i] = gpu_input_handle;
253254

254-
// Copy data from CPU to GPU
255+
// Async copy data from CPU to GPU
255256
ET_CHECK_OR_RETURN_ERROR(
256-
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
257+
aoti_torch_copy_async(gpu_inputs[i], cpu_tensor, stream) == Error::Ok,
257258
Internal,
258-
"Failed to copy input %d from CPU to GPU",
259+
"Failed to async copy input %d from CPU to GPU",
259260
i);
260261
}
261262
// Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -288,6 +289,8 @@ class ET_EXPERIMENTAL CudaBackend final
288289
gpu_outputs[i] = gpu_output_handle;
289290
}
290291
// Run AOTI container with GPU tensors
292+
// Note: kernel is queued on the same stream as H2D copies,
293+
// so it will automatically wait for copies to complete
291294
AOTIRuntimeError error = handle->run(
292295
handle->container_handle,
293296
gpu_inputs.data(), // Use GPU input tensors
@@ -303,7 +306,7 @@ class ET_EXPERIMENTAL CudaBackend final
303306
"AOTInductorModelContainerRun failed with error code %d",
304307
error);
305308

306-
// Copy GPU output results back to CPU output tensors
309+
// Async copy GPU output results back to CPU output tensors
307310
for (int i = 0; i < n_outputs; i++) {
308311
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
309312
// For DYNAMIC_BOUND tensors we try to resize
@@ -312,11 +315,15 @@ class ET_EXPERIMENTAL CudaBackend final
312315
"Error resizing tensor at output index %d",
313316
i);
314317
ET_CHECK_OK_OR_RETURN_ERROR(
315-
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
316-
"Failed to copy GPU output %d back to CPU",
318+
aoti_torch_copy_async(cpu_output_tensor, gpu_outputs[i], stream),
319+
"Failed to async copy GPU output %d back to CPU",
317320
i);
318321
}
319322

323+
// Synchronize stream to ensure all async operations complete
324+
// before returning to the caller
325+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamSynchronize(stream));
326+
320327
return Error::Ok;
321328
}
322329

backends/cuda/runtime/shims/memory.cpp

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,111 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
582582
return Error::Ok;
583583
}
584584

585+
AOTITorchError
586+
aoti_torch_copy_async(Tensor* self, Tensor* src, cudaStream_t stream) {
587+
// Check for null pointers first
588+
ET_CHECK_OR_RETURN_ERROR(
589+
self != nullptr,
590+
InvalidArgument,
591+
"aoti_torch_copy_async failed: self tensor is null");
592+
593+
ET_CHECK_OR_RETURN_ERROR(
594+
src != nullptr,
595+
InvalidArgument,
596+
"aoti_torch_copy_async failed: src tensor is null");
597+
598+
// Get dtype information and validate compatibility
599+
int32_t self_dtype, src_dtype;
600+
aoti_torch_get_dtype(self, &self_dtype);
601+
aoti_torch_get_dtype(src, &src_dtype);
602+
603+
ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype));
604+
ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype));
605+
606+
// Check dtype compatibility - both tensors must have the same dtype
607+
ET_CHECK_OR_RETURN_ERROR(
608+
self_dtype == src_dtype,
609+
InvalidArgument,
610+
"dtype mismatch. self.dtype=%d, src.dtype=%d. aoti_torch_copy_async requires same dtypes",
611+
self_dtype,
612+
src_dtype);
613+
614+
// Check total number of elements compatibility
615+
int64_t self_numel = self->numel();
616+
int64_t src_numel = src->numel();
617+
618+
ET_CHECK_OR_RETURN_ERROR(
619+
self_numel == src_numel,
620+
InvalidArgument,
621+
"numel mismatch. self.numel()=%ld, src.numel()=%ld",
622+
self_numel,
623+
src_numel);
624+
625+
// Get tensor metadata
626+
int64_t* self_strides;
627+
int64_t* src_strides;
628+
aoti_torch_get_strides(self, &self_strides);
629+
aoti_torch_get_strides(src, &src_strides);
630+
631+
// Check if tensors have the same strides (required for async copy)
632+
bool same_strides = true;
633+
for (int i = 0; i < self->dim(); i++) {
634+
if (self_strides[i] != src_strides[i]) {
635+
same_strides = false;
636+
break;
637+
}
638+
}
639+
640+
ET_CHECK_OR_RETURN_ERROR(
641+
same_strides,
642+
InvalidArgument,
643+
"aoti_torch_copy_async requires tensors with same strides. Use aoti_torch_copy_ for non-contiguous tensors");
644+
645+
// Determine device locations
646+
cudaPointerAttributes srcAttributes{};
647+
cudaPointerAttributes dstAttributes{};
648+
649+
ET_CUDA_CHECK_OR_RETURN_ERROR(
650+
cudaPointerGetAttributes(&srcAttributes, src->data_ptr()));
651+
652+
ET_CUDA_CHECK_OR_RETURN_ERROR(
653+
cudaPointerGetAttributes(&dstAttributes, self->data_ptr()));
654+
655+
bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice;
656+
bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice;
657+
658+
size_t total_bytes = src->nbytes();
659+
660+
// Determine copy direction and perform async copy
661+
if (srcIsDevice && dstIsDevice) {
662+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpyAsync(
663+
self->mutable_data_ptr(),
664+
src->data_ptr(),
665+
total_bytes,
666+
cudaMemcpyDeviceToDevice,
667+
stream));
668+
} else if (srcIsDevice && !dstIsDevice) {
669+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpyAsync(
670+
self->mutable_data_ptr(),
671+
src->data_ptr(),
672+
total_bytes,
673+
cudaMemcpyDeviceToHost,
674+
stream));
675+
} else if (!srcIsDevice && dstIsDevice) {
676+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpyAsync(
677+
self->mutable_data_ptr(),
678+
src->data_ptr(),
679+
total_bytes,
680+
cudaMemcpyHostToDevice,
681+
stream));
682+
} else {
683+
// Host to host - use regular memcpy (no async benefit)
684+
std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes);
685+
}
686+
687+
return Error::Ok;
688+
}
689+
585690
AOTITorchError aoti_torch__reinterpret_tensor(
586691
Tensor* self,
587692
int64_t ndim,

backends/cuda/runtime/shims/memory.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,31 @@ AOTI_SHIM_EXPORT AOTITorchError aoti_torch__reinterpret_tensor(
140140
AOTI_SHIM_EXPORT AOTITorchError
141141
aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
142142

143+
/**
144+
* Asynchronously copies data from source tensor to destination tensor.
145+
*
146+
* This function performs an asynchronous memory copy between tensors using
147+
* cudaMemcpyAsync. The copy is queued on the specified CUDA stream and returns
148+
* immediately without waiting for completion. The caller must synchronize the
149+
* stream before accessing the destination data.
150+
*
151+
* Requirements:
152+
* - Both tensors must have the same dtype and number of elements
153+
 * - Both tensors must have identical strides (the same memory layout)
155+
 * - For tensors with differing strides, use aoti_torch_copy_ instead
155+
*
156+
* @param self Destination tensor (data will be overwritten)
157+
* @param src Source tensor (data will be copied from this tensor)
158+
* @param stream CUDA stream on which to queue the async copy
159+
*
160+
* @return Error::Ok on success, appropriate error code on failure:
161+
* - Error::InvalidArgument: null pointers, dtype mismatch, numel
162+
* mismatch, or non-contiguous tensors
163+
* - Error::Internal: CUDA operation failures
164+
*/
165+
AOTI_SHIM_EXPORT AOTITorchError
166+
aoti_torch_copy_async(Tensor* self, Tensor* src, cudaStream_t stream);
167+
143168
/**
144169
* Creates a new tensor handle from an existing one.
145170
*

0 commit comments

Comments
 (0)