@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
63
63
auto *dst_ptr = static_cast <void *>(dst_tensor.mutable_data <CUR_TYPE>(
64
64
tensor.dims (), platform::CPUPlace ()));
65
65
66
- platform::DeviceContextPool &pool =
67
- platform::DeviceContextPool::Instance ();
68
- auto dev_ctx = static_cast <const platform::CUDADeviceContext *>(
69
- pool.Get (tensor.place ()));
70
-
71
- paddle::platform::GpuMemcpyAsync (
72
- dst_ptr, src_ptr, sizeof (CUR_TYPE) * tensor.numel (),
73
- cudaMemcpyDeviceToHost, dev_ctx->stream ());
74
- dev_ctx->Wait ();
66
+ paddle::platform::GpuMemcpySync (dst_ptr, src_ptr,
67
+ sizeof (CUR_TYPE) * tensor.numel (),
68
+ cudaMemcpyDeviceToHost);
75
69
#else
76
70
PADDLE_THROW (" 'CUDAPlace' is not supported in CPU only device." );
77
71
#endif
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
184
178
185
179
self->Resize (framework::make_ddim (dims));
186
180
auto *dst = self->mutable_data <T>(place);
187
-
188
- platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance ();
189
- auto dev_ctx =
190
- static_cast <const platform::CUDADeviceContext *>(pool.Get (place));
191
- paddle::platform::GpuMemcpyAsync (dst, array.data (), sizeof (T) * array.size (),
192
- cudaMemcpyHostToDevice, dev_ctx->stream ());
193
- // NOTE: For safety, here wait the copy complete.
194
- // It because the CPU array.data() could be destroyed after this method.
195
- // If we make this method async, it could be copied data from a memory buffer
196
- // that has been freed.
197
- dev_ctx->Wait ();
181
+ paddle::platform::GpuMemcpySync (dst, array.data (), sizeof (T) * array.size (),
182
+ cudaMemcpyHostToDevice);
198
183
}
199
184
200
185
template <>
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
214
199
215
200
self->Resize (framework::make_ddim (dims));
216
201
auto *dst = self->mutable_data <platform::float16>(place);
217
-
218
- platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance ();
219
- auto dev_ctx =
220
- static_cast <const platform::CUDADeviceContext *>(pool.Get (place));
221
- paddle::platform::GpuMemcpyAsync (dst, array.data (),
222
- sizeof (uint16_t ) * array.size (),
223
- cudaMemcpyHostToDevice, dev_ctx->stream ());
224
- // NOTE: For safety, here wait the copy complete.
225
- // It because the CPU array.data() could be destroyed after this method.
226
- // If we make this method async, it could be copied data from a memory buffer
227
- // that has been freed.
228
- dev_ctx->Wait ();
202
+ paddle::platform::GpuMemcpySync (dst, array.data (),
203
+ sizeof (uint16_t ) * array.size (),
204
+ cudaMemcpyHostToDevice);
229
205
}
230
206
231
207
template <typename T>
0 commit comments