Commit 2673798

"fix float16 ShuffleDownSync Bug" (#12756)
* "fix bug" * "add test case"
1 parent 6fe5547 commit 2673798
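
The underlying bug: __shfl_down (and its CUDA 9 replacement __shfl_down_sync) returns the shuffled value; it does not modify its argument in place. The old float16 specialization called the intrinsic, discarded the result, and returned the unshuffled input. A minimal sketch of the wrong and fixed patterns (illustrative only, not from the commit; the pre-CUDA-9 intrinsic is shown for brevity):

  // Buggy pattern: the shuffled value is thrown away, so every lane
  // just gets its own input back.
  __device__ float BuggyShuffle(float val) {
    __shfl_down(val, 1u);  // return value ignored; val is NOT updated
    return val;
  }

  // Fixed pattern: use the value the intrinsic returns.
  __device__ float FixedShuffle(float val) {
    return __shfl_down(val, 1u);
  }

The commit also adds a CUDA >= 9.0 float16 specialization that routes through __shfl_down_sync with the lane mask, and casts delta to unsigned to match the intrinsic signatures.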

File tree

2 files changed: +93 -5 lines changed


paddle/fluid/platform/cuda_device_function.h

Lines changed: 11 additions & 4 deletions
@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 #if CUDA_VERSION < 9000
   return __shfl_down(val, delta, width);
 #else
-  return __shfl_down_sync(mask, val, delta, width);
+  return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
 #endif
 }

@@ -46,9 +46,16 @@ template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
-  half tmp = static_cast<half>(val);
-  __shfl_down(tmp, static_cast<unsigned>(delta), width);
-  return float16(tmp);
+  return float16(
+      __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
+}
+#else
+template <>
+__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
+                                                       float16 val, int delta,
+                                                       int width) {
+  return float16(__shfl_down_sync(mask, static_cast<half>(val),
+                                  static_cast<unsigned>(delta), width));
 }
 #endif
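
For reference, a hedged usage sketch of the fixed specialization; the kernel and its name are illustrative, not part of the commit. It assumes the header's default shuffle width (the test below calls CudaShuffleDownSync with three arguments) and the CREATE_SHFL_MASK macro the new test also uses:

  #include "paddle/fluid/platform/cuda_device_function.h"

  using paddle::platform::float16;

  // Each lane i takes the value held by lane i + 1; the top lane keeps
  // its own value (out-of-range shuffles return the caller's input).
  __global__ void ShiftDownByOne(float16* data) {
    unsigned mask = 0u;
    CREATE_SHFL_MASK(mask, true);  // full active-lane mask on CUDA >= 9.0
    float16 v = data[threadIdx.x];
    data[threadIdx.x] = paddle::platform::CudaShuffleDownSync(mask, v, 1);
  }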

paddle/fluid/platform/cuda_helper_test.cu

Lines changed: 82 additions & 1 deletion
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include <gtest/gtest.h>
+#include <algorithm>
 #include <iostream>
 #include <random>

@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
   for (size_t i = 0; i < num / 2; ++i) {
-    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.
     EXPECT_NEAR(static_cast<float>(out[i]),
                 static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),

@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
   TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
   TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
+
+// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
+template <typename T>
+static __forceinline__ __device__ T WarpReduceSum(T val) {
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
+  }
+  return val;
+}
+
+template <typename T>
+__forceinline__ __device__ T BlockReduce(T val) {
+  static __shared__ T shared[32];  // Shared mem for 32 partial sums
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = WarpReduceSum(val);  // Each warp performs partial reduction
+
+  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory
+
+  __syncthreads();  // Wait for all partial reductions
+
+  // read from shared memory only if that warp existed
+  val =
+      (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
+
+  if (wid == 0) val = WarpReduceSum(val);  // Final reduce within first warp
+
+  return val;
+}
+
+template <typename T>
+__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
+  T sum(0);
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    sum += in[i];
+  }
+  sum = BlockReduce<T>(sum);
+  __syncthreads();
+  if (threadIdx.x == 0) out[blockIdx.x] = sum;
+}
+
+template <typename T>
+void TestReduce(size_t num, float atol = 0.01) {
+  T* in1;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+  in1 = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+  }
+  auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaDeviceSynchronize();
+  DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  // NOTE(dzhwinter): the float16 add has small underflow/overflow
+  // so we use EXPECT_NEAR to check the result.
+  EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
+  free(in1);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaShuffleSync, float16) {
+  TestReduce<float>(10);
+  TestReduce<float>(1000);
+
+  // float16 will overflow or accumulate truncate errors in big size.
+  TestReduce<float16>(10);
+  TestReduce<float16>(100, /*atol error*/ 1.0);
+}
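
The test launches DeviceReduceSum with a single block, so the grid-stride loop walks the whole input and the final sum lands in out[0]. With a multi-block grid, each block would write only its partial sum, and a second pass would be needed to combine them. A sketch under that assumption (TwoPassReduce and d_partials are hypothetical names, not part of the commit):

  // Reduce n values with `blocks` blocks, then reduce the per-block
  // partials in one more single-block pass. d_partials must hold at
  // least `blocks` elements.
  template <typename T>
  void TwoPassReduce(T* d_in, T* d_partials, T* d_out, size_t n, int blocks) {
    DeviceReduceSum<T><<<blocks, PADDLE_CUDA_NUM_THREADS>>>(d_in, d_partials, n);
    DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(
        d_partials, d_out, static_cast<size_t>(blocks));
  }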
