 namespace paddle {
 namespace operators {

-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;

 template <typename T>
-__global__ void KeBilinearInterpFw(
+__global__ void KeNearestNeighborInterpFw(
     const T* in, const size_t in_img_h, const size_t in_img_w,
     const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
+    const size_t num_channels, const T ratio_h, const T ratio_w) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < nthreads) {
@@ -36,34 +33,22 @@ __global__ void KeBilinearInterpFw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
+    int in_img_idy = static_cast<int>(round(ratio_h * out_img_idy));

     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
-
-    const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
-                          in_img_idy * in_img_w + in_img_idx];
-
-    // bilinear interpolation
-    out[out_id_h * output_w + out_id_w] =
-        h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
-        h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
-                    w1lambda * in_pos[h_id * in_img_w + w_id]);
+    int in_img_idx = static_cast<int>(round(ratio_w * out_img_idx));
+
+    out[tid] = in[out_id_h * input_w + channel_id * in_img_size +
+                  in_img_idy * in_img_w + in_img_idx];
   }
 }

 template <typename T>
-__global__ void KeBilinearInterpBw(
+__global__ void KeNearestNeighborInterpBw(
     T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
     const size_t input_w, const T* out, const size_t out_img_h,
     const size_t out_img_w, const size_t output_h, const size_t output_w,
-    const size_t num_channels, const T ratio_h, const T ratioW) {
+    const size_t num_channels, const T ratio_h, const T ratio_w) {
   int nthreads = output_h * output_w;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < nthreads) {
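Note on the forward change above: each output pixel (out_img_idy, out_img_idx) now reads the single nearest input pixel at (round(ratio_h * out_img_idy), round(ratio_w * out_img_idx)) instead of blending four neighbors with the bilinear lambdas. As a rough cross-check, a minimal host-side sketch of the same mapping might look like the following; the helper name NearestNeighborInterpRef and the raw NCHW float arrays are assumptions for illustration, not part of this patch.

#include <cmath>

// Hedged CPU reference of the forward mapping (assumed helper, NCHW layout).
void NearestNeighborInterpRef(const float* in, float* out, int n, int c,
                              int in_h, int in_w, int out_h, int out_w) {
  float ratio_h = (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
  float ratio_w = (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
  for (int i = 0; i < n * c; ++i) {  // one image plane per batch*channel pair
    const float* in_img = in + i * in_h * in_w;
    float* out_img = out + i * out_h * out_w;
    for (int oy = 0; oy < out_h; ++oy) {
      int iy = static_cast<int>(std::round(ratio_h * oy));
      for (int ox = 0; ox < out_w; ++ox) {
        int ix = static_cast<int>(std::round(ratio_w * ox));
        out_img[oy * out_w + ox] = in_img[iy * in_w + ix];
      }
    }
  }
}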
@@ -74,25 +59,15 @@ __global__ void KeBilinearInterpBw(
     int channel_id = out_id_w / out_img_size;

     int out_img_idy = (out_id_w % out_img_size) / out_img_w;
-    int in_img_idy = ratio_h * out_img_idy;
-    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
-    T h1lambda = ratio_h * out_img_idy - in_img_idy;
-    T h2lambda = 1.f - h1lambda;
+    int in_img_idy = static_cast<int>(round(ratio_h * out_img_idy));

     int out_img_idx = tid % out_img_w;
-    int in_img_idx = ratioW * out_img_idx;
-    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
-    T w1lambda = ratioW * out_img_idx - in_img_idx;
-    T w2lambda = 1.f - w1lambda;
+    int in_img_idx = static_cast<int>(round(ratio_w * out_img_idx));

     T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
                     in_img_idy * in_img_w + in_img_idx];
-    const T* out_pos = &out[out_id_h * output_w + out_id_w];
-    atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]);
-    atomicAdd(&in_pos[h_id * in_img_w + w_id],
-              h1lambda * w1lambda * out_pos[0]);
+    const T out_pos = out[out_id_h * output_w + out_id_w];
+    atomicAdd(in_pos, out_pos);
   }
 }

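The backward kernel above is the mirror image: each output-gradient element is scattered into its nearest input pixel. atomicAdd is still required because, when upsampling, several output pixels round to the same input location; for example, with in_w = 2 and out_w = 4, ratio_w = (2 - 1) / (4 - 1) = 1/3, so output columns 0 and 1 both land on input column 0 and columns 2 and 3 on column 1. A serial CPU sketch of the same accumulation for a single H x W plane, assuming din has already been zero-initialized as SetConstant does on the GPU path (helper name and layout assumed for illustration):

#include <cmath>

// Hedged CPU sketch of the backward scatter for one H x W plane.
void NearestNeighborInterpGradRef(float* din, const float* dout,
                                  int in_h, int in_w, int out_h, int out_w) {
  float ratio_h = (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
  float ratio_w = (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
  for (int oy = 0; oy < out_h; ++oy) {
    int iy = static_cast<int>(std::round(ratio_h * oy));
    for (int ox = 0; ox < out_w; ++ox) {
      int ix = static_cast<int>(std::round(ratio_w * ox));
      // Plain += suffices serially; the CUDA kernel needs atomicAdd because
      // many threads can hit the same (iy, ix) when out_h/out_w exceed in_h/in_w.
      din[iy * in_w + ix] += dout[oy * out_w + ox];
    }
  }
}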
@@ -102,101 +77,103 @@ class NearestNeighborInterpOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
-    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
-    auto* input = input_t->data<T>();
+    auto* input = ctx.Input<Tensor>("X");      // float tensor
+    auto* output = ctx.Output<Tensor>("Out");  // float tensor
+    auto* input_data = input->data<T>();

     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
-    auto out_dims = output_t->dims();
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
       Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
       auto size_data = sizes.data<int>();
       out_h = size_data[0];
       out_w = size_data[1];
     }
-    auto* output = output_t->mutable_data<T>(
-        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());

-    int batch_size = input_t->dims()[0];
-    int channels = input_t->dims()[1];
-    int in_h = input_t->dims()[2];
-    int in_w = input_t->dims()[3];
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int in_h = input->dims()[2];
+    int in_w = input->dims()[3];
+
+    auto* output_data =
+        output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());

     int in_hw = in_h * in_w;
     int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;

     T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
     T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;

     if (in_h == out_h && in_w == out_w) {
-      memcpy(output, input, input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
-
-      KeBilinearInterpFw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          input, in_h, in_w, batch_size, in_chw, output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
+      memcpy(output_data, input_data, input->numel() * sizeof(T));
+      return;
     }
+
+    int threadNum = n * out_chw;
+    int blocks = (threadNum + 1024 - 1) / 1024;
+
+    KeNearestNeighborInterpFw<
+        T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
+        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
+        out_chw, c, ratio_h, ratio_w);
   }
 };

 template <typename T>
 class NearestNeighborInterpGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_output = d_output_t->data<T>();
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output_grad_data = output_grad->data<T>();
+    auto* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());

     auto& device_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
     math::SetConstant<platform::CUDADeviceContext, T> zero;
-    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+    zero(device_ctx, input_grad, static_cast<T>(0.0));

     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");

-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
+    auto out_size = ctx.Input<Tensor>("OutSize");
+    if (out_size != nullptr) {
       Tensor sizes;
-      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
       auto size_data = sizes.data<int>();
       out_h = size_data[0];
       out_w = size_data[1];
     }

-    int batch_size = d_input_t->dims()[0];
-    int channels = d_input_t->dims()[1];
-    int in_h = d_input_t->dims()[2];
-    int in_w = d_input_t->dims()[3];
+    int n = input_grad->dims()[0];
+    int c = input_grad->dims()[1];
+    int in_h = input_grad->dims()[2];
+    int in_w = input_grad->dims()[3];

     int in_hw = in_h * in_w;
     int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
+    int in_chw = c * in_hw;
+    int out_chw = c * out_hw;

     T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
     T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;

     if (in_h == out_h && in_w == out_w) {
-      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
-    } else {
-      int threadNum = batch_size * out_chw;
-      int blocks = (threadNum + 1024 - 1) / 1024;
-
-      KeBilinearInterpBw<
-          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
-          d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
-          batch_size, out_chw, channels, ratio_h, ratio_w);
+      memcpy(input_grad_data, output_grad_data,
+             input_grad->numel() * sizeof(T));
+      return;
     }
+
+    int threadNum = n * out_chw;
+    int blocks = (threadNum + 1024 - 1) / 1024;
+
+    KeNearestNeighborInterpBw<
+        T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
+        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
+        n, out_chw, c, ratio_h, ratio_w);
   }
 };

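Both Compute() bodies launch one thread per output element, threadNum = n * out_chw, in fixed 1024-thread blocks with a ceil-divided grid, so the tid < nthreads guard inside the kernels absorbs any surplus threads in the last block. A standalone sketch of that arithmetic with concrete numbers, chosen only for illustration:

#include <cassert>

int main() {
  const int kThreadsPerBlock = 1024;
  int n = 1, c = 3, out_h = 30, out_w = 30;
  int threadNum = n * c * out_h * out_w;  // 2700 output elements
  int blocks = (threadNum + kThreadsPerBlock - 1) / kThreadsPerBlock;  // ceil division
  assert(blocks == 3);  // 3 * 1024 = 3072 threads; the last 372 fail tid < nthreads
  return 0;
}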
@@ -206,5 +183,5 @@ class NearestNeighborInterpGradOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(nearest_neighbor_interp,
                         ops::NearestNeighborInterpOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(nearest_neighborinterp_grad,
+REGISTER_OP_CUDA_KERNEL(nearest_neighbor_interp_grad,
                         ops::NearestNeighborInterpGradOpCUDAKernel<float>);