Skip to content

Commit daa3f79

Browse files
committed
ggml: change cpy_tensor_async (cuda/cann) to run on the dst stream
1 parent 1984136 commit daa3f79

File tree

3 files changed

+4
-15
lines changed

3 files changed

+4
-15
lines changed

ggml/src/ggml-backend.cpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,23 +1383,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
13831383
} else {
13841384
ggml_backend_synchronize(split_backend);
13851385
}
1386-
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1387-
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1386+
ggml_backend_synchronize(input_backend);
13881387
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1389-
ggml_backend_synchronize(input_backend);
13901388
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
13911389
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
13921390
} else {
13931391
ggml_backend_synchronize(split_backend);
13941392
}
13951393
ggml_backend_tensor_copy(input, input_cpy);
13961394
}
1397-
else {
1398-
if (input_backend->iface.synchronize) {
1399-
// async copy succeeded, need to synchronize the input backend to ensure the copy is done before the split backend uses it
1400-
input_backend->iface.synchronize(input_backend);
1401-
}
1402-
}
14031395
}
14041396
}
14051397

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1913,10 +1913,7 @@ static bool ggml_backend_cann_cpy_tensor_async(
19131913
cann_ctx_src->task_queue.wait();
19141914
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
19151915
ACL_MEMCPY_DEVICE_TO_DEVICE,
1916-
cann_ctx_src->stream()));
1917-
1918-
//TODO: workaround for Event didn`t work here.
1919-
aclrtSynchronizeStream(cann_ctx_src->stream());
1916+
cann_ctx_dst->stream()));
19201917
} else {
19211918
// src and dst are on the same backend
19221919
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2460,7 +2460,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
24602460
#ifdef GGML_CUDA_NO_PEER_COPY
24612461
return false;
24622462
#else
2463-
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
2463+
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_dst->stream()));
24642464
#endif
24652465
}
24662466

@@ -2474,7 +2474,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
24742474
}
24752475
} else {
24762476
// src and dst are on the same backend
2477-
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
2477+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
24782478
}
24792479
return true;
24802480
}

0 commit comments

Comments
 (0)