
Commit 4fbde42

chengduo authored and dzhwinter committed

Fix __shfl_down_sync_ of cross_entropy (#10345)

* fix __shfl_down_sync_ of cross_entropy
* use reduceSum
* "fix ci"
1 parent 6d5e582 commit 4fbde42

5 files changed: +88 −108 lines

paddle/fluid/operators/elementwise_op_function.h

Lines changed: 3 additions & 39 deletions

@@ -22,6 +22,7 @@ limitations under the License. */
 #ifdef __NVCC__
 #include <cuda.h>
 #include <thrust/iterator/iterator_adaptor.h>
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif
@@ -336,43 +337,6 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
 }

 #ifdef __NVCC__
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
-  const int warpSize = 32;
-  __shared__ T shm[warpSize];
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += platform::__shfl_down_sync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-  __syncthreads();
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += platform::__shfl_down_sync(mask, val, offset);
-  }
-
-  return val;
-}
-
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int h, int w,
@@ -395,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(

   if (dy) {
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = reduceSum(val, tid, h);
+    val = paddle::platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
@@ -472,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   if (dy) {
     int h = pre * post;
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = reduceSum(val, tid, h);
+    val = paddle::platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
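
Both broadcast kernels now call the shared paddle::platform::reduceSum rather than a file-local copy. To illustrate the calling pattern in isolation, here is a minimal standalone sketch (ColumnSumKernel and its launch are hypothetical, not Paddle code; it assumes cuda_device_function.h is on the include path):

#include "paddle/fluid/platform/cuda_device_function.h"

// Hypothetical kernel: one block per column of a row-major h x w matrix,
// mirroring how ElemwiseGradBroadcast1CUDAKernel accumulates dy.
template <typename T>
__global__ void ColumnSumKernel(const T* x, T* col_sum, int h, int w) {
  int j = blockIdx.x;  // column handled by this block
  int tid = threadIdx.x;
  T val = 0;
  // Strided loop: each thread folds every blockDim.x-th row into a register.
  for (int i = tid; i < h; i += blockDim.x) {
    val += x[i * w + j];
  }
  // Every thread holds a (possibly zero) partial, so len == blockDim.x.
  val = paddle::platform::reduceSum(val, tid, blockDim.x);
  if (tid == 0) {
    col_sum[j] = val;  // thread 0 receives the block total
  }
}

A launch such as ColumnSumKernel<float><<<w, 256>>>(d_x, d_col_sum, h, w) then yields one sum per column, which is exactly the shape of the dy reduction above.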

paddle/fluid/operators/math/cross_entropy.cu

Lines changed: 10 additions & 55 deletions

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
@@ -30,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
   }
 }

-template <typename T>
-__device__ __forceinline__ T sum_single_warp(T val) {
-  val += platform::__shfl_down_sync(0, val, 16);
-  val += platform::__shfl_down_sync(0, val, 8);
-  val += platform::__shfl_down_sync(0, val, 4);
-  val += platform::__shfl_down_sync(0, val, 2);
-  val += platform::__shfl_down_sync(0, val, 1);
-  return val;
-}
-
-// CUDA do not support dynamic arrary in template
-// https://stackoverflow.com/questions/20497209
-template <typename T>
-struct SharedMemory {
-  // Ensure that we won't compile any un-specialized types
-  __device__ T* GetPointer() { return NULL; }
-};
-
-template <>
-struct SharedMemory<float> {
-  __device__ float* GetPointer() {
-    extern __shared__ float s_float[];
-    return s_float;
-  }
-};
-
-template <>
-struct SharedMemory<double> {
-  __device__ double* GetPointer() {
-    extern __shared__ double s_double[];
-    return s_double;
-  }
-};
-
 template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  SharedMemory<T> d_sum_shared;
-  T* d_sum = d_sum_shared.GetPointer();
-  d_sum[tid] = 0;
+  T val = 0;

-  int cur_idx = tid;
-  int next_idx = blockIdx.x * class_num + tid;
-  while (cur_idx < class_num) {
-    d_sum[tid] +=
-        math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
-    next_idx += blockDim.x;
-    cur_idx += blockDim.x;
+  int idx = blockIdx.x * class_num + tid;
+  int end = blockIdx.x * class_num + class_num;
+  for (; idx < end; idx += blockDim.x) {
+    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
   }
-  __syncthreads();

-  for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) {
-    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
-    __syncthreads();
+  val = paddle::platform::reduceSum(val, tid, blockDim.x);
+  if (threadIdx.x == 0) {
+    Y[blockIdx.x] = -val;
   }
-
-  T val = d_sum[tid];
-  val = sum_single_warp<T>(val);
-  if (tid == 0) Y[blockIdx.x] = -val;
 }
 }  // namespace

@@ -113,9 +70,7 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
               ? 512
               : pow(2, static_cast<int>(std::log2(class_num)));

-    SoftCrossEntropyKernel<T><<<
-        batch_size, block, block * sizeof(T),
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+    SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
         loss_data, prob_data, label_data, class_num);
   } else {
     const int64_t* label_data = labels->data<int64_t>();
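
The rewritten SoftCrossEntropyKernel accumulates into a register and reduces with paddle::platform::reduceSum, which is why the SharedMemory<T> specializations, sum_single_warp, and the block * sizeof(T) dynamic shared-memory argument can all be deleted. A minimal host-side driver for the new launch (a sketch under stated assumptions: the kernel is visible at this scope, and the uniform-distribution check is hypothetical test code, not Paddle's):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int batch_size = 4, class_num = 100;
  // Uniform probabilities and uniform soft labels, so each row's loss is
  // -sum_j (1/C) * log(1/C) = log(C).
  std::vector<float> h_prob(batch_size * class_num, 1.0f / class_num);
  std::vector<float> h_label(batch_size * class_num, 1.0f / class_num);
  float *d_prob, *d_label, *d_loss;
  cudaMalloc(&d_prob, h_prob.size() * sizeof(float));
  cudaMalloc(&d_label, h_label.size() * sizeof(float));
  cudaMalloc(&d_loss, batch_size * sizeof(float));
  cudaMemcpy(d_prob, h_prob.data(), h_prob.size() * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_label, h_label.data(), h_label.size() * sizeof(float),
             cudaMemcpyHostToDevice);

  // Same block-size rule as CrossEntropyFunctor: at most 512 threads,
  // rounded down to a power of two (64 for class_num == 100).
  int block = class_num > 512
                  ? 512
                  : static_cast<int>(
                        pow(2, static_cast<int>(std::log2(class_num))));

  // One block per batch row; note there is no dynamic shared memory anymore.
  SoftCrossEntropyKernel<float><<<batch_size, block>>>(d_loss, d_prob,
                                                       d_label, class_num);

  std::vector<float> h_loss(batch_size);
  cudaMemcpy(h_loss.data(), d_loss, batch_size * sizeof(float),
             cudaMemcpyDeviceToHost);
  printf("loss[0] = %f, expected %f\n", h_loss[0],
         std::log(static_cast<float>(class_num)));
  cudaFree(d_prob);
  cudaFree(d_label);
  cudaFree(d_loss);
  return 0;
}

Because the strided for loop covers tid, tid + blockDim.x, ..., a power-of-two block still handles a class count that is not a power of two; the old shared-memory tree, by contrast, relied on blockDim.x being a power of two.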

paddle/fluid/operators/row_conv_op.cu

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/row_conv_op.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/cuda_device_function.h"

 namespace paddle {
 namespace operators {
paddle/fluid/platform/cuda_device_function.h

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+// __shfl_down and __shfl have been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T __shfl_sync(unsigned, T val, int src_line,
+                                         int width) {
+  return __shfl(val, src_line, width);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // NOTE(zcd): The warp size should be taken from the
+  // parameters of the GPU but not specified as 32 simply.
+  // To make the reduceSum more efficiently,
+  // I use Warp-Level Parallelism and assume the Warp size
+  // is 32 which may be different for different GPU,
+  // but most card's warp size is 32.
+  const int warpSize = 32;
+  __shared__ T shm[warpSize];
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += platform::__shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+  __syncthreads();
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += platform::__shfl_down_sync(mask, val, offset);
+  }
+  return val;
+}
+
+}  // namespace platform
+}  // namespace paddle
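
The new header gathers the compatibility shims and the reduction in one place: CREATE_SHFL_MASK ballots the lanes that hold valid data, each warp reduces its lanes with shuffles, warp leaders park their partial sums in shared memory, and the first warp folds those partials. A minimal single-warp sketch of the mask-plus-shuffle idiom (hypothetical kernel, assuming CUDA 9+, where CREATE_SHFL_MASK expands to __ballot_sync and __shfl_down_sync is the builtin):

#include <cstdio>
#include "paddle/fluid/platform/cuda_device_function.h"

// Hypothetical kernel: sums up to 32 floats with one warp.
// Launch as WarpSumKernel<<<1, 32>>>(d_in, d_out, n).
__global__ void WarpSumKernel(const float* in, float* out, int n) {
  int lane = threadIdx.x;
  // Zero-pad lanes beyond n so all 32 lanes can join every shuffle.
  float val = lane < n ? in[lane] : 0.0f;
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, lane < 32);  // full-warp mask via __ballot_sync
  // Halving tree: offsets 16, 8, 4, 2, 1; lane 0 ends with the total.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset);  // CUDA 9+ builtin
  }
  if (lane == 0) {
    *out = val;
  }
}

Zero-padding plus a full mask keeps every lane's shuffle well defined; reduceSum above applies the same halving tree twice, once inside each warp and once across the warp leaders' partials in shm.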

paddle/fluid/platform/cuda_primitives.h

Lines changed: 0 additions & 13 deletions

@@ -66,18 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif

-// __shfl_down has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
 }  // namespace platform
 }  // namespace paddle
