@@ -32,6 +32,18 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
32
32
#ifdef PADDLE_WITH_CUDA
33
33
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024 ; // 64K
34
34
35
+ inline void SyncCUDAStream () {
36
+ #if !defined(_WIN32)
37
+ cudaStreamSynchronize (0 );
38
+ #else
39
+ cudaError_t e_sync = cudaSuccess;
40
+ while (e_sync = cudaStreamQuery (0 )) {
41
+ if (e_sync == cudaErrorNotReady) continue ;
42
+ break ;
43
+ }
44
+ #endif
45
+ }
46
+
35
47
// NOTE(zcd): Avoid GpuMemcpySync as much as possible,
36
48
// because GpuMemcpySync issues the copying command to the default stream,
37
49
// which prevents commands from different streams from running concurrently.
@@ -55,7 +67,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
55
67
platform::GpuMemcpySync (dst, src, num, cudaMemcpyDeviceToHost);
56
68
// FIXME(zjl): do we really need it?
57
69
if (num <= kMaxGpuAsyncCopyBytes ) {
58
- cudaStreamSynchronize ( 0 );
70
+ SyncCUDAStream ( );
59
71
}
60
72
}
61
73
}
@@ -77,7 +89,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
77
89
platform::GpuMemcpySync (dst, src, num, cudaMemcpyHostToDevice);
78
90
// FIXME(zjl): do we really need it?
79
91
if (num <= kMaxGpuAsyncCopyBytes ) {
80
- cudaStreamSynchronize ( 0 );
92
+ SyncCUDAStream ( );
81
93
}
82
94
}
83
95
}
0 commit comments