Skip to content

Commit 3222cf1

Browse files
author
chengduo
authored
Merge pull request #10325 from chengduoZH/fix_shfl_sync
Fix shfl_sync for CUDA8.0
2 parents 4613aeb + 90d73c7 commit 3222cf1

File tree

4 files changed

+28
-10
lines changed

4 files changed

+28
-10
lines changed

paddle/cuda/include/hl_base.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,21 @@ extern __thread cudaStream_t default_stream;
228228
<< "CUDA error: " << hl_get_device_error_string((size_t)err); \
229229
}
230230

// __shfl has been deprecated as of CUDA 9.0 in favor of the *_sync
// variants, which take an explicit warp-participation mask. On pre-9.0
// toolkits we provide a source-compatible shim so device code can use
// the CUDA 9 spelling unconditionally.
#if CUDA_VERSION < 9000
// Shim for the CUDA 9 intrinsic. The mask argument is ignored because the
// legacy __shfl implicitly operates on the (converged) warp. The `width`
// default matches the CUDA 9 signature, so call sites that omit it still
// compile on CUDA 8.
template <typename T>
__forceinline__ __device__ T
__shfl_sync(unsigned, T val, int src_line, int width = 32) {
  return __shfl(val, src_line, width);
}

// Legacy shuffles have no mask concept; the predicate is not evaluated.
// No trailing semicolon here: callers terminate the statement themselves,
// exactly as with the CUDA 9 definition below (the original `mask = 0u;`
// expanded `CREATE_SHFL_MASK(m, p);` into `m = 0u;;`).
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u
#else
#define FULL_WARP_MASK 0xFFFFFFFF
// Mask of the lanes (within a fully active warp) for which `predicate`
// holds, suitable for passing to the *_sync warp intrinsics.
#define CREATE_SHFL_MASK(mask, predicate) \
  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
231246
#endif /* __NVCC__ */
232247

233248
#endif /* HL_BASE_H_ */

paddle/cuda/src/hl_cuda_lstm.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -341,12 +341,15 @@ void hl_lstm_parallel_forward(real *gateValue,
341341
}
342342

343343
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
344-
int addr = idx % 32;
344+
const int warp_size = 32;
345+
int addr = idx % warp_size;
346+
unsigned mask = 0u;
347+
CREATE_SHFL_MASK(mask, addr < warp_size);
345348
#pragma unroll
346349
for (int k = 1; k < 32; k++) {
347350
// rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
348-
addr = __shfl_sync(addr, (idx + 1) % 32, 32);
349-
a[k] = __shfl_sync(a[k], addr, 32);
351+
addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
352+
a[k] = __shfl_sync(mask, a[k], addr, 32);
350353
}
351354

352355
#pragma unroll
@@ -360,10 +363,11 @@ __device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
360363
}
361364

362365
addr = (32 - idx) % 32;
366+
CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
363367
#pragma unroll
364368
for (int k = 0; k < 32; k++) {
365-
a[k] = __shfl_sync(a[k], addr, 32);
366-
addr = __shfl_sync(addr, (idx + 31) % 32, 32);
369+
a[k] = __shfl_sync(mask, a[k], addr, 32);
370+
addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
367371
}
368372
}
369373

paddle/cuda/src/hl_top_k.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,13 +244,16 @@ __device__ __forceinline__ void blockReduce(Pair* shTopK,
244244
if (--beamSize == 0) break;
245245
__syncthreads();
246246

247+
unsigned mask = 0u;
248+
// CREATE_SHFL_MASK(mask, tid < len);
249+
247250
if (tid == maxId[0]) {
248251
if (beam < maxLength) {
249252
shTopK[tid] = topK[beam];
250253
}
251254
}
252255
if (maxId[0] / 32 == warp) {
253-
if (__shfl_sync(beam, (maxId[0]) % 32, 32) == maxLength) break;
256+
if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
254257
}
255258
}
256259
}

paddle/fluid/platform/cuda_primitives.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,6 @@ __forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
7474
}
7575
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
7676
#else
77-
template <typename T>
78-
__forceinline__ __device__ T __shfl_down_sync(unsigned mask, T val, int delta) {
79-
return __shfl_down(mask, val, delta);
80-
}
8177
#define FULL_WARP_MASK 0xFFFFFFFF
8278
#define CREATE_SHFL_MASK(mask, predicate) \
8379
mask = __ballot_sync(FULL_WARP_MASK, (predicate))

0 commit comments

Comments
 (0)