 // limitations under the License.

 #include <gtest/gtest.h>
+#include <numeric>  // for std::accumulate
 #include <iostream>
 #include <random>

@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
   for (size_t i = 0; i < num / 2; ++i) {
-    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // NOTE(dzhwinter): the float16 add has a small truncation error,
     // so we use EXPECT_NEAR to check the result.
     EXPECT_NEAR(static_cast<float>(out[i]),
                 static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
   TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
   TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
 }
+
+// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
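+// CREATE_SHFL_MASK and CudaShuffleDownSync are Paddle's portability
+// wrappers around the warp shuffle intrinsics (the CUDA 9 switch from
+// __shfl_down to __shfl_down_sync).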
+template <typename T>
+static __forceinline__ __device__ T WarpReduceSum(T val) {
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
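+  // Tree reduction: each step adds the value held by the lane `offset`
+  // positions above, halving the stride until lane 0 holds the sum of
+  // the whole warp.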
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
+  }
+  return val;
+}
+
+template <typename T>
+__forceinline__ __device__ T BlockReduce(T val) {
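+  // Two-level reduction: each warp reduces its own values, warp leaders
+  // park their partial sums in shared memory, and the first warp then
+  // reduces those partials to the block total.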
+  static __shared__ T shared[32];  // Shared mem for 32 partial sums
+  int lane = threadIdx.x % warpSize;  // lane index within the warp
+  int wid = threadIdx.x / warpSize;   // warp index within the block
+
+  val = WarpReduceSum(val);  // Each warp performs partial reduction
+
+  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory
+
+  __syncthreads();  // Wait for all partial reductions
+
+  // Read a partial sum from shared memory only if that warp existed.
+  // NOTE: this assumes blockDim.x is a multiple of warpSize; a partial
+  // trailing warp's sum would otherwise be dropped here.
+  val =
+      (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
+
+  if (wid == 0) val = WarpReduceSum(val);  // Final reduce within first warp
+
+  return val;
+}
+
+template <typename T>
+__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
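+  // Grid-stride loop: each thread accumulates a strided slice of the
+  // input, so one launch covers any N regardless of grid size.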
+  T sum(0);
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    sum += in[i];
+  }
+  sum = BlockReduce<T>(sum);
+  __syncthreads();
+  if (threadIdx.x == 0) out[blockIdx.x] = sum;
+}
+
+template <typename T>
+void TestReduce(size_t num, float atol = 0.01) {
+  T* in1;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       // device input
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));  // device result
+  in1 = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+  }
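+  // Host-side reference sum, accumulated in T so the comparison sees the
+  // same precision as the device reduction.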
+  auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaDeviceSynchronize();
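+  // A single block is launched, so the kernel leaves the final sum in
+  // out[0]; the result is copied back into in1[0] for comparison.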
+  DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  // NOTE(dzhwinter): the float16 add has a small truncation error,
+  // so we use EXPECT_NEAR to check the result.
+  EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
+  free(in1);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaShuffleSync, float16) {
+  TestReduce<float>(10);
+  TestReduce<float>(1000);
+
+  // float16 overflows or accumulates truncation error at larger sizes,
+  // so it gets a looser tolerance.
+  TestReduce<float16>(10);
+  TestReduce<float16>(100, /*atol*/ 1.0);
+}