Merge pull request #10202 from reyoung/feature/clean_memcpy_async

reyoung · web-flow · commit 01da25845e2c · 2018-04-27T17:36:18.000+08:00
Clean up async memcpy usage: replace GpuMemcpyAsync followed by dev_ctx->Wait() with GpuMemcpySync
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
@@ -63,15 +63,9 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Get(tensor.place()));
-
-        paddle::platform::GpuMemcpyAsync(
-            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
-            cudaMemcpyDeviceToHost, dev_ctx->stream());
-        dev_ctx->Wait();
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
 #else
         PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -184,17 +178,8 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<T>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <>
@@ -214,18 +199,9 @@ void PyCUDATensorSetFromArray(
 
   self->Resize(framework::make_ddim(dims));
   auto *dst = self->mutable_data<platform::float16>(place);
-
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(),
-                                   sizeof(uint16_t) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
-  // NOTE: For safety, here wait the copy complete.
-  // It because the CPU array.data() could be destroyed after this method.
-  // If we make this method async, it could be copied data from a memory buffer
-  // that has been freed.
-  dev_ctx->Wait();
+  paddle::platform::GpuMemcpySync(dst, array.data(),
+                                  sizeof(uint16_t) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 
 template <typename T>