Skip to content

Commit 01da258

Browse files
authored
Merge pull request #10202 from reyoung/feature/clean_memcpy_async
Clean memcpy async
2 parents: 99714a7 + deabc8c; commit: 01da258

File tree

1 file changed

+8
-32
lines changed

1 file changed

+8
-32
lines changed

paddle/fluid/pybind/tensor_py.h

Lines changed: 8 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
6363
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
6464
tensor.dims(), platform::CPUPlace()));
6565

66-
platform::DeviceContextPool &pool =
67-
platform::DeviceContextPool::Instance();
68-
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
69-
pool.Get(tensor.place()));
70-
71-
paddle::platform::GpuMemcpyAsync(
72-
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
73-
cudaMemcpyDeviceToHost, dev_ctx->stream());
74-
dev_ctx->Wait();
66+
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
67+
sizeof(CUR_TYPE) * tensor.numel(),
68+
cudaMemcpyDeviceToHost);
7569
#else
7670
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
7771
#endif
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
184178

185179
self->Resize(framework::make_ddim(dims));
186180
auto *dst = self->mutable_data<T>(place);
187-
188-
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
189-
auto dev_ctx =
190-
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
191-
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
192-
cudaMemcpyHostToDevice, dev_ctx->stream());
193-
// NOTE: For safety, here wait the copy complete.
194-
// It because the CPU array.data() could be destroyed after this method.
195-
// If we make this method async, it could be copied data from a memory buffer
196-
// that has been freed.
197-
dev_ctx->Wait();
181+
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
182+
cudaMemcpyHostToDevice);
198183
}
199184

200185
template <>
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
214199

215200
self->Resize(framework::make_ddim(dims));
216201
auto *dst = self->mutable_data<platform::float16>(place);
217-
218-
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
219-
auto dev_ctx =
220-
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
221-
paddle::platform::GpuMemcpyAsync(dst, array.data(),
222-
sizeof(uint16_t) * array.size(),
223-
cudaMemcpyHostToDevice, dev_ctx->stream());
224-
// NOTE: For safety, here wait the copy complete.
225-
// It because the CPU array.data() could be destroyed after this method.
226-
// If we make this method async, it could be copied data from a memory buffer
227-
// that has been freed.
228-
dev_ctx->Wait();
202+
paddle::platform::GpuMemcpySync(dst, array.data(),
203+
sizeof(uint16_t) * array.size(),
204+
cudaMemcpyHostToDevice);
229205
}
230206

231207
template <typename T>

0 commit comments

Comments
 (0)