
Commit 05163e1

fix bug of prelu when rank not equal 4, test=develop (#25067) (#25235)
* fix bug of prelu when rank not equal 4, test=develop
* fix prelu inference, test=develop
* fix api, test=develop
* fix shape when mode is chennel, test=develop
* remove debug code, test=develop
* add unittest, test=develop
1 parent 3ebc81d commit 05163e1
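
For orientation, here is a minimal NumPy sketch (mine, not part of the commit) of what PReLU computes in its three modes for an input of arbitrary rank in N, C, ... layout; the point of the fix is that none of this requires a 4-D input.

import numpy as np

def prelu_reference(x, alpha, mode):
    # x: ndarray of shape (N, C, ...); alpha holds the learned negative slopes.
    if mode == 'all':        # a single slope shared by every element
        scale = alpha.reshape(())
    elif mode == 'channel':  # one slope per channel, broadcast over the batch
                             # and the trailing spatial dims
        scale = alpha.reshape((1, x.shape[1]) + (1,) * (x.ndim - 2))
    elif mode == 'element':  # one slope per element, shared across the batch
        scale = alpha.reshape((1,) + x.shape[1:])
    else:
        raise ValueError('mode should be one of all, channel, element.')
    return np.where(x > 0, x, scale * x)

# Example: a rank-3 input, the case the old 4-D-only code mishandled.
x = np.random.randn(4, 6, 32).astype('float32')
alpha = np.full(6, 0.25, dtype='float32')
out = prelu_reference(x, alpha, 'channel')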

File tree

8 files changed: 146 additions, 61 deletions


paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu (14 additions, 13 deletions)

@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include <stdio.h>
+
 #include <cassert>
 #include <vector>
+
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
@@ -55,24 +57,23 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
   // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
   const float *alpha = p_gpu_weight_;
   float *output = reinterpret_cast<float **>(outputs)[0];
-
-  std::vector<int> input_shape;
-  input_shape.push_back(batch_size);
+  int numel = 1;
   for (int i = 0; i < input_dims.nbDims; i++) {
-    input_shape.push_back(input_dims.d[i]);
+    numel *= input_dims.d[i];
   }
 
   if (mode_ == "channel") {
     operators::math::PreluChannelWiseDirectCUDAFunctor<float>
         prelu_channel_wise;
-    prelu_channel_wise(stream, input, alpha, output, input_shape);
+    prelu_channel_wise(stream, input, alpha, output, input_dims.d[0],
+                       input_dims.d[1], numel);
   } else if (mode_ == "element") {
     operators::math::PreluElementWiseDirectCUDAFunctor<float>
         prelu_element_wise;
-    prelu_element_wise(stream, input, alpha, output, input_shape);
+    prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
   } else {
     operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
-    prelu_scalar(stream, input, alpha, output, input_shape);
+    prelu_scalar(stream, input, alpha, output, numel);
   }
   return cudaGetLastError() != cudaSuccess;
 }
@@ -133,23 +134,23 @@ int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   const float *alpha = p_gpu_weight_;
   const float *input = static_cast<const float *>(inputs[0]);
   float *output = static_cast<float *>(outputs[0]);
-
-  std::vector<int> input_shape;
+  int numel = 1;
   for (int i = 0; i < input_dims.nbDims; i++) {
-    input_shape.push_back(input_dims.d[i]);
+    numel *= input_dims.d[i];
   }
 
   if (mode_ == "channel") {
     operators::math::PreluChannelWiseDirectCUDAFunctor<float>
         prelu_channel_wise;
-    prelu_channel_wise(stream, input, alpha, output, input_shape);
+    prelu_channel_wise(stream, input, alpha, output, input_dims.d[0],
+                       input_dims.d[1], numel);
   } else if (mode_ == "element") {
     operators::math::PreluElementWiseDirectCUDAFunctor<float>
         prelu_element_wise;
-    prelu_element_wise(stream, input, alpha, output, input_shape);
+    prelu_element_wise(stream, input, alpha, output, input_dims.d[0], numel);
   } else {
     operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
-    prelu_scalar(stream, input, alpha, output, input_shape);
+    prelu_scalar(stream, input, alpha, output, numel);
   }
   return cudaGetLastError() != cudaSuccess;
 }
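
A hedged Python sketch of the argument preparation the dynamic-shape plugin now performs: flatten the runtime dims (which include the batch dim) into a single element count and pass each functor only what it needs. The helper name below is illustrative, not the plugin's API.

from functools import reduce
from operator import mul

def plugin_args(dims, mode):
    numel = reduce(mul, dims, 1)        # mirrors: numel *= input_dims.d[i]
    if mode == 'channel':
        return dims[0], dims[1], numel  # batch, channel, numel
    if mode == 'element':
        return dims[0], numel           # batch, numel
    return (numel,)                     # scalar mode only needs numel

print(plugin_args([8, 6, 32, 32], 'channel'))  # (8, 6, 49152)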

paddle/fluid/operators/math/prelu.cu (13 additions, 23 deletions)

@@ -21,8 +21,8 @@ namespace math {
 #define CUDA_NUM_THREADS 1024
 
 // CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+#define CUDA_KERNEL_LOOP(i, n)                                    \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
 
 inline static int PADDLE_GET_BLOCKS(const int N) {
@@ -33,7 +33,6 @@ template <typename T>
 __global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
                                        T *output, size_t channel_num,
                                        size_t plane_size, size_t numel) {
-  size_t index;
   CUDA_KERNEL_LOOP(index, numel) {
     size_t temp = index / plane_size;
     size_t channel_index = temp % channel_num;
@@ -47,7 +46,6 @@ template <typename T>
 __global__ void PReluElementWiseKernel(const T *input, const T *alpha,
                                        T *output, size_t spatial_size,
                                        size_t numel) {
-  size_t index;
   CUDA_KERNEL_LOOP(index, numel) {
     size_t element_index = index % spatial_size;
     T scale = alpha[element_index];
@@ -60,7 +58,6 @@ template <typename T>
 __global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
                                   size_t numel) {
   T scale = alpha[0];
-  size_t index;
   CUDA_KERNEL_LOOP(index, numel) {
     T x = input[index];
     output[index] = (x > 0) ? x : scale * x;
@@ -70,34 +67,27 @@ __global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
 template <typename T>
 void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
     cudaStream_t stream, const T *input, const T *alpha, T *output,
-    std::vector<int> input_shape) {
-  size_t plane_size = input_shape[2] * input_shape[3];
-  size_t spatial_size = input_shape[1] * plane_size;
-  size_t numel = input_shape[0] * spatial_size;
+    size_t batch_size, size_t channel, size_t numel) {
   PReluChannelWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
-                           stream>>>(input, alpha, output, input_shape[1],
-                                     plane_size, numel);
+                           stream>>>(input, alpha, output, channel,
+                                     numel / batch_size / channel, numel);
 }
 
 template <typename T>
-void PreluElementWiseDirectCUDAFunctor<T>::operator()(
-    cudaStream_t stream, const T *input, const T *alpha, T *output,
-    std::vector<int> input_shape) {
-  size_t plane_size = input_shape[2] * input_shape[3];
-  size_t spatial_size = input_shape[1] * plane_size;
-  size_t numel = input_shape[0] * spatial_size;
+void PreluElementWiseDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
+                                                      const T *input,
+                                                      const T *alpha, T *output,
+                                                      size_t batch_size,
+                                                      size_t numel) {
   PReluElementWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
-                           stream>>>(input, alpha, output, spatial_size, numel);
+                           stream>>>(input, alpha, output, numel / batch_size,
+                                     numel);
 }
 
 template <typename T>
 void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
                                                  const T *input, const T *alpha,
-                                                 T *output,
-                                                 std::vector<int> input_shape) {
-  size_t plane_size = input_shape[2] * input_shape[3];
-  size_t spatial_size = input_shape[1] * plane_size;
-  size_t numel = input_shape[0] * spatial_size;
+                                                 T *output, size_t numel) {
   PReluScalarKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
       input, alpha, output, numel);
 }
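
The functors now derive plane_size as numel / batch / channel, so the same flat indexing works for any rank of at least 2. A NumPy sketch (my reference, assuming a contiguous N, C, ... layout) that mirrors the index math of PReluChannelWiseKernel:

import numpy as np

def prelu_channel_flat(x, alpha):
    # Walk the flattened buffer and recover the channel index arithmetically,
    # exactly as the kernel does, so the tensor's rank never matters.
    batch, channel = x.shape[0], x.shape[1]
    numel = x.size
    plane_size = numel // batch // channel      # product of the trailing dims
    flat = x.ravel()
    out = np.empty_like(flat)
    for index in range(numel):
        channel_index = (index // plane_size) % channel
        scale = alpha[channel_index]
        out[index] = flat[index] if flat[index] > 0 else scale * flat[index]
    return out.reshape(x.shape)

# Sanity check against plain broadcasting for a rank-3 input.
x = np.random.randn(2, 3, 5).astype('float32')
alpha = np.array([0.1, 0.2, 0.3], dtype='float32')
expected = np.where(x > 0, x, alpha.reshape(1, 3, 1) * x)
assert np.allclose(prelu_channel_flat(x, alpha), expected)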

paddle/fluid/operators/math/prelu.h (4 additions, 3 deletions)

@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <vector>
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 
@@ -26,21 +27,21 @@ template <typename T>
 class PreluChannelWiseDirectCUDAFunctor {
  public:
  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
+                  T *output, size_t batch_size, size_t channel, size_t numel);
 };
 
 template <typename T>
 class PreluElementWiseDirectCUDAFunctor {
  public:
  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
+                  T *output, size_t batch_size, size_t numel);
 };
 
 template <typename T>
 class PreluScalarDirectCUDAFunctor {
  public:
  void operator()(cudaStream_t stream, const T *input, const T *alpha,
-                  T *output, std::vector<int> input_shape);
+                  T *output, size_t numel);
 };
 
 #endif

paddle/fluid/operators/prelu_op.cc (14 additions, 0 deletions)

@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/prelu_op.h"
+
 #include <memory>
 #include <string>
 
@@ -43,10 +44,23 @@ class PReluOp : public framework::OperatorWithKernel {
                             "equal to the number of channels of input(x). But "
                             "recevied alpha's size: %d, x_dim[1]: %d",
                             product(ctx->GetInputDim("Alpha")), x_dim[1]));
+      auto x_rank = x_dim.size();
+      PADDLE_ENFORCE_GE(x_rank, 2,
+                        platform::errors::InvalidArgument(
+                            "For mode 'channel', rank of input X must be "
+                            "equal or larger than 2. But recevied X's "
+                            "rank: %d",
+                            x_rank));
     } else if (mode == "element") {
       auto alpha_dim = ctx->GetInputDim("Alpha");
       auto alpha_rank = alpha_dim.size();
       auto x_rank = x_dim.size();
+      PADDLE_ENFORCE_GE(x_rank, 1,
+                        platform::errors::InvalidArgument(
+                            "For mode 'element', rank of input X must be "
+                            "equal or larger than 2. But recevied X's "
+                            "rank: %d",
+                            x_rank));
       PADDLE_ENFORCE_EQ(
           alpha_rank, x_rank,
           platform::errors::InvalidArgument(
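
The added PADDLE_ENFORCE_GE calls amount to simple rank preconditions per mode; a hypothetical Python rendering of the same checks (the function name is mine, not Paddle's):

def check_prelu_rank(x_rank, mode):
    # Mirrors the checks added to PReluOp::InferShape above.
    if mode == 'channel' and x_rank < 2:
        raise ValueError("For mode 'channel', rank of input X must be "
                         "equal or larger than 2, but received %d" % x_rank)
    if mode == 'element' and x_rank < 1:
        raise ValueError("For mode 'element', rank of input X must be "
                         "equal or larger than 1, but received %d" % x_rank)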

paddle/fluid/operators/prelu_op.cu (19 additions, 13 deletions)

@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/prelu.h"
 #include "paddle/fluid/operators/prelu_op.h"
@@ -49,20 +50,22 @@ class CUDAPReluKernel : public framework::OpKernel<T> {
 
     int numel = x->numel();
     auto dim = x->dims();
-    std::vector<int> input_shape = framework::vectorize<int>(dim);
+
+    VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1]
+            << ", numel:" << numel;
 
     if (mode == "channel") {
       math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
       prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, input_shape);
+                         alpha_ptr, o_ptr, dim[0], dim[1], numel);
     } else if (mode == "element") {
       math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
       prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, input_shape);
+                         alpha_ptr, o_ptr, dim[0], numel);
     } else {
       math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
       prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
-                   o_ptr, input_shape);
+                   o_ptr, numel);
     }
   }
 };
@@ -75,7 +78,6 @@ __global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,
                                   size_t channel_num, size_t plane_size,
                                   size_t spatial_size, size_t numel,
                                   PRELU_MODE mode) {
-  size_t index;
  CUDA_KERNEL_LOOP(index, numel) {
     T scale;
     if (mode == Element) {
@@ -99,14 +101,18 @@ template <typename T>
 class PreluOpGradFunctor {
  public:
  void operator()(cudaStream_t stream, const T* x, const T* alpha, const T* dy,
-                  T* dx, T* dalpha, std::vector<int> input_shape,
+                  T* dx, T* dalpha, const framework::DDim& input_dims,
                   PRELU_MODE mode) {
-    size_t plane_size = input_shape[2] * input_shape[3];
-    size_t spatial_size = plane_size * input_shape[1];
-    size_t numel = spatial_size * input_shape[0];
+    size_t numel = 1;
+    for (size_t i = 0; i < input_dims.size(); ++i) {
+      numel *= input_dims[i];
+    }
+    size_t plane_size = numel / input_dims[0] / input_dims[1];
+    size_t spatial_size = numel / input_dims[0];
+
     PReluOpGradKernel<
         T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-        x, alpha, dy, dx, dalpha, input_shape[1], plane_size, spatial_size,
+        x, alpha, dy, dx, dalpha, input_dims[1], plane_size, spatial_size,
         numel, mode);
   }
 };
@@ -161,13 +167,13 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       m = Scalar;
     }
     PreluOpGradFunctor<T> prelu_grad;
-    prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr,
-               input_shape, m);
+    prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim,
+               m);
 
     if (dalpha_tmp_ptr == nullptr) return;
 
     std::vector<int> reduce_dims;
-    for (size_t i = 0; i < input_shape.size(); i++) {
+    for (size_t i = 0; i < dim.size(); i++) {
       if (mode == "channel" && i == 1) continue;
       if (mode == "element" && i != 0) continue;
       reduce_dims.push_back(i);
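
For the backward pass, a NumPy reference (my sketch, not the kernel) of what PReluOpGradKernel plus the reduce_dims reduction compute in channel mode for an input of any rank of at least 2:

import numpy as np

def prelu_grad_channel(x, alpha, dy):
    # dx: the gradient passes through where x > 0 and is scaled by alpha elsewhere.
    scale = alpha.reshape((1, x.shape[1]) + (1,) * (x.ndim - 2))
    dx = np.where(x > 0, dy, scale * dy)
    # dalpha: d(scale * x)/d(scale) = x where x <= 0; the reduce_dims loop
    # above keeps only axis 1 for channel mode, so reduce over every other axis.
    dalpha_tmp = np.where(x > 0, np.zeros_like(x), dy * x)
    reduce_axes = tuple(i for i in range(x.ndim) if i != 1)
    dalpha = dalpha_tmp.sum(axis=reduce_axes)
    return dx, dalpha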

python/paddle/fluid/dygraph/nn.py (4 additions, 1 deletion)

@@ -2262,7 +2262,10 @@ def __init__(self,
             assert isinstance(
                 channel,
                 int), "channel argument is required when mode is 'channel'."
-            self._alpha_shape = [1, channel, 1, 1]
+            #NOTE(zhiqiu): The _alpha_shape should be [1, channel] + [1] * len(input_shape[2:]), not [1, channel, 1, 1].
+            # However, the suffix 1 in the list is useless, since the tensor is viewed as one demension array during kernel calculation.
+            # And, input_shape is not required when mode is 'channel', so it is simplified.
+            self._alpha_shape = [1, channel]
         elif mode == 'element':
             assert isinstance(input_shape, (
                 list, tuple
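
The NOTE relies on the CUDA kernel reading alpha as a flat, channel-indexed buffer, so the trailing 1s in the old [1, channel, 1, 1] shape carried no information; a quick NumPy illustration:

import numpy as np

channel = 6
alpha_old = np.random.randn(1, channel, 1, 1).astype('float32')
alpha_new = alpha_old.reshape(1, channel)

# Both layouts flatten to the same per-channel slope table, which is all the
# kernel ever reads (it indexes the raveled buffer by channel_index).
assert np.array_equal(alpha_old.ravel(), alpha_new.ravel())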

python/paddle/fluid/layers/nn.py (12 additions, 2 deletions)

@@ -10661,10 +10661,20 @@ def prelu(x, mode, param_attr=None, name=None):
     if mode not in ['all', 'channel', 'element']:
         raise ValueError('mode should be one of all, channel, element.')
     alpha_shape = [1]
+    # NOTE(): The input of this API should be ``N,C,...`` format,
+    # which means x.shape[0] is batch_size and x.shape[0] is channel.
     if mode == 'channel':
-        alpha_shape = [1, x.shape[1], 1, 1]
+        assert len(
+            x.shape
+        ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
+        #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
+        # To be consistent with Prelu, it is simplified.
+        alpha_shape = [1, x.shape[1]]
     elif mode == 'element':
-        alpha_shape = [1, x.shape[1], x.shape[2], x.shape[3]]
+        assert len(
+            x.shape
+        ) >= 1, "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'"
+        alpha_shape = [1] + list(x.shape)[1:]
     dtype = helper.input_dtype(input_param_name='x')
     alpha = helper.create_parameter(
         attr=helper.param_attr,
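
A hedged usage sketch of the updated API on a non-4-D input; the prelu signature matches this file, but the surrounding program setup is illustrative and may differ across Paddle versions.

import paddle.fluid as fluid

# Rank-3 input in N, C, L layout, which the old [1, C, 1, 1] alpha shape rejected.
x = fluid.data(name='x', shape=[-1, 6, 128], dtype='float32')
y_channel = fluid.layers.prelu(x, mode='channel')  # alpha_shape == [1, 6]
y_element = fluid.layers.prelu(x, mode='element')  # alpha_shape == [1, 6, 128]
y_all = fluid.layers.prelu(x, mode='all')          # alpha_shape == [1]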
