
Commit 0c24b3f

Clean memcpy async
1 parent bfbbe19 commit 0c24b3f


2 files changed: +8 / -33 lines changed


paddle/fluid/framework/details/fetch_op_handle.cc

Lines changed: 0 additions & 1 deletion
@@ -67,7 +67,6 @@ void FetchOpHandle::RunImpl() {
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
-      dev_ctxes_.at(t.place())->Wait();
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
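The dropped Wait() relies on the TensorCopy call above already completing the transfer before it returns (presumably what the trailing `true` requests), so the caller no longer has to synchronize the device context itself. As a rough stand-alone illustration of the two call-site patterns, with hypothetical helper names rather than Paddle's actual implementation:

#include <cuda_runtime.h>

// Pattern the diff removes: enqueue an async copy, then wait explicitly,
// analogous to TensorCopy(...) followed by dev_ctx->Wait().
void CopyAsyncThenWait(void *dst, const void *src, size_t bytes,
                       cudaStream_t stream) {
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);  // explicit caller-side wait
}

// Pattern the diff keeps: a copy that blocks internally, so no extra wait
// is needed at the call site.
void CopyBlocking(void *dst, const void *src, size_t bytes) {
  cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);  // synchronous copy
}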

paddle/fluid/pybind/tensor_py.h

Lines changed: 8 additions & 32 deletions
@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Get(tensor.place()));
-
-        paddle::platform::GpuMemcpyAsync(
-            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
-            cudaMemcpyDeviceToHost, dev_ctx->stream());
-        dev_ctx->Wait();
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
 #else
         PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
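Each of these call sites replaces the DeviceContextPool lookup, GpuMemcpyAsync, and Wait() with a single GpuMemcpySync call taking the same destination, source, byte count, and direction. A minimal sketch of a wrapper with that calling convention, assuming it simply forwards to the blocking cudaMemcpy (Paddle's real implementation and error handling are not shown in this diff):

#include <cassert>
#include <cuda_runtime.h>

// Sketch only: a synchronous copy helper matching the call sites above.
inline void GpuMemcpySyncSketch(void *dst, const void *src, size_t count,
                                enum cudaMemcpyKind kind) {
  cudaError_t err = cudaMemcpy(dst, src, count, kind);  // blocks until done
  assert(err == cudaSuccess && "cudaMemcpy failed");
}

Because the call returns only after the transfer finishes, the dst_ptr staging buffer handed to the Python buffer protocol is guaranteed to already hold the copied data.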
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<T>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <>
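The removed NOTE explains why the old async path had to wait: array.data() is a host buffer owned by the caller and can be destroyed as soon as this function returns, while a still-running cudaMemcpyAsync could keep reading from it. A small, self-contained illustration of that hazard (hypothetical code, not from the Paddle tree):

#include <cuda_runtime.h>
#include <vector>

// BUG: the async copy may still be reading array.data() after `array`
// goes out of scope when this function returns.
void UnsafeUpload(float *dst_on_gpu, cudaStream_t stream) {
  std::vector<float> array(1024, 1.0f);
  cudaMemcpyAsync(dst_on_gpu, array.data(), sizeof(float) * array.size(),
                  cudaMemcpyHostToDevice, stream);
}  // `array` is freed here, possibly before the copy finishes

// Safe: a blocking copy returns only after the host buffer has been read.
void SafeUpload(float *dst_on_gpu) {
  std::vector<float> array(1024, 1.0f);
  cudaMemcpy(dst_on_gpu, array.data(), sizeof(float) * array.size(),
             cudaMemcpyHostToDevice);
}

Switching to GpuMemcpySync preserves that guarantee while dropping the manual dev_ctx->Wait().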
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<platform::float16>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(),
-                                   sizeof(uint16_t) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(),
+                                  sizeof(uint16_t) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <typename T>
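The float16 overload sizes the transfer with sizeof(uint16_t) * array.size() rather than sizeof(platform::float16), which is only equivalent if platform::float16 is a plain 16-bit type. A compile-time check of that assumption could look like the following sketch (the header path is an assumption here):

#include <cstdint>

#include "paddle/fluid/platform/float16.h"  // assumed header location

static_assert(sizeof(paddle::platform::float16) == sizeof(uint16_t),
              "raw uint16_t-sized GpuMemcpySync assumes float16 is exactly 16 bits");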
