[cherry-pick 1.8] Fix random hang of the PaddleDetection training task on Windows (#24980)

liupluswei · web-flow · commit 1185a96f5998 · 2020-06-10T09:12:44.000+08:00
* cherry-pick #24977
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
@@ -32,6 +32,18 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
 #ifdef PADDLE_WITH_CUDA
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
 
+inline void SyncCUDAStream() {  // Waits until CUDA stream 0 has finished.
+#if !defined(_WIN32)
+  cudaStreamSynchronize(0);  // Blocking wait; the return code is ignored.
+#else  // Windows: poll the stream instead of blocking on it.
+  cudaError_t e_sync = cudaSuccess;
+  while (e_sync = cudaStreamQuery(0)) {  // Assignment is intended; cudaSuccess (0) ends the loop.
+    if (e_sync == cudaErrorNotReady) continue;  // Work still pending: query again.
+    break;  // Any other error: stop polling; the error is not reported.
+  }
+#endif
+}
+
 // NOTE(zcd): Do not use GpuMemcpySync as much as possible.
 // because GpuMemcpySync issues the copying command to the default stream,
 // which will make two commands from different streams cannot run concurrently.
@@ -55,7 +67,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }
@@ -77,7 +89,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
     // FIXME(zjl): do we really need it?
     if (num <= kMaxGpuAsyncCopyBytes) {
-      cudaStreamSynchronize(0);
+      SyncCUDAStream();
     }
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -32,6 +32,18 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,`
`32`	`32`	`#ifdef PADDLE_WITH_CUDA`
`33`	`33`	`static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K`
`34`	`34`
	`35`	`+inline void SyncCUDAStream() {`
	`36`	`+#if !defined(_WIN32)`
	`37`	`+ cudaStreamSynchronize(0);`
	`38`	`+#else`
	`39`	`+ cudaError_t e_sync = cudaSuccess;`
	`40`	`+ while (e_sync = cudaStreamQuery(0)) {`
	`41`	`+ if (e_sync == cudaErrorNotReady) continue;`
	`42`	`+ break;`
	`43`	`+ }`
	`44`	`+#endif`
	`45`	`+}`
	`46`	`+`
`35`	`47`	`// NOTE(zcd): Do not use GpuMemcpySync as much as possible.`
`36`	`48`	`// because GpuMemcpySync issues the copying command to the default stream,`
`37`	`49`	`// which will make two commands from different streams cannot run concurrently.`
`@@ -55,7 +67,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(`
`55`	`67`	`platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);`
`56`	`68`	`// FIXME(zjl): do we really need it?`
`57`	`69`	`if (num <= kMaxGpuAsyncCopyBytes) {`
`58`		`- cudaStreamSynchronize(0);`
	`70`	`+ SyncCUDAStream();`
`59`	`71`	`}`
`60`	`72`	`}`
`61`	`73`	`}`
`@@ -77,7 +89,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(`
`77`	`89`	`platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);`
`78`	`90`	`// FIXME(zjl): do we really need it?`
`79`	`91`	`if (num <= kMaxGpuAsyncCopyBytes) {`
`80`		`- cudaStreamSynchronize(0);`
	`92`	`+ SyncCUDAStream();`
`81`	`93`	`}`
`82`	`94`	`}`
`83`	`95`	`}`