
Commit 20bdc3e

Merge pull request #10846 from kuke/deconv_group

Add groups for conv transpose ops

Authored by Yibing Liu
2 parents 55d3951 + 4bafbf4

File tree

6 files changed: 224 additions, 75 deletions

paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc

Lines changed: 39 additions & 19 deletions

@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {

     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);

@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);

     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }

     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {

     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);

     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,27 +213,39 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }

     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
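Note on the grouped cuDNN calls above: the kernels keep a single descriptor per tensor (built with the channel count divided by groups) and simply advance the raw device pointers by a fixed per-group element count on every iteration. A minimal Python sketch of that offset arithmetic, using made-up example shapes (the helper name group_offsets is hypothetical, not part of the patch):

from functools import reduce

def group_offsets(input_dims, output_dims, filter_dims, groups):
    # Same rule as the CUDA kernel: activations skip numel / batch / groups
    # elements per group, the filter skips numel / groups elements per group.
    numel = lambda dims: reduce(lambda a, b: a * b, dims)
    input_offset = numel(input_dims) // input_dims[0] // groups
    output_offset = numel(output_dims) // output_dims[0] // groups
    filter_offset = numel(filter_dims) // groups
    return input_offset, output_offset, filter_offset

# Example: N=2, M=8 input channels, C=4 output channels, 3x3 kernel, groups=2.
# Layouts: input (N, M, H, W), output (N, C, O_h, O_w), filter (M, C/g, K_h, K_w).
print(group_offsets([2, 8, 16, 16], [2, 4, 18, 18], [8, 2, 3, 3], groups=2))
# -> (1024, 648, 72): elements to skip per group for input, output and filter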

paddle/fluid/operators/conv_transpose_op.cc

Lines changed: 12 additions & 4 deletions

@@ -32,6 +32,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+  int groups = ctx->Attrs().Get<int>("groups");

   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -48,10 +49,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                     "ConvTransposeOp paddings dimension and dilations "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "In ConvTransposeOp, The input channel should be the same "
-                    "as the number of filters.");
+                    "In ConvTransposeOp, The number of input channels should "
+                    "be equal to the number of filter's channels.");

-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
   for (size_t i = 0; i < strides.size(); ++i) {
     auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
     output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
@@ -102,7 +103,10 @@ void Conv2DTransposeOpMaker::Make() {
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
-
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1}), the "
                             "dilations(h_dilation, w_dilation) of convolution "
@@ -204,6 +208,10 @@ void Conv3DTransposeOpMaker::Make() {
            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
            "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution3d "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")

paddle/fluid/operators/conv_transpose_op.h

Lines changed: 58 additions & 28 deletions

@@ -70,7 +70,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    // groups will alway be disabled in conv2dtranspose.
+    int groups = context.Attr<int>("groups");

     const int batch_size = static_cast<int>(input->dims()[0]);

@@ -81,18 +81,18 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {

     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
-    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w}
     size_t data_dim = filter_shape_vec.size() - 2;
     std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output->dims()[1];
+    col_shape_vec[0] = output->dims()[1] / groups;
     for (size_t j = 0; j < data_dim; ++j) {
       col_shape_vec[j + 1] = filter_shape_vec[j + 2];
       col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
     }
     DDim col_shape(framework::make_ddim(col_shape_vec));

     // use col_matrix_shape in the gemm calculation
-    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w)
     DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);

     Tensor col;
@@ -111,7 +111,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};

-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
     DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
     filter.Resize(filter_matrix_shape);

@@ -121,6 +121,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));

+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
     math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
     math::Col2VolFunctor<DeviceContext, T> col2vol;

@@ -133,22 +135,29 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
       Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);

-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      blas.MatMul(filter, true, input_batch, false, static_cast<T>(1.0),
-                  &col_matrix, static_cast<T>(0.0));
-
-      if (data_dim == 2U) {
-        // col2im: col_matrix -> dy
-        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
-        col2im(dev_ctx, col, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &output_batch);
-      } else if (data_dim == 3U) {
-        // col2vol: col_matrix -> dy
-        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
+        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+        Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
+
+        // col_matrix = filter_slice * input_slice
+        // of shape (c/g * k_h * k_w, h * w)
+        // or (c/g * k_d * k_h * k_w, d * h * w)
+        blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
+                    &col_matrix, static_cast<T>(0.0));
+
+        if (data_dim == 2U) {
+          // col2im: col_matrix -> dy
+          // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w)
+          col2im(dev_ctx, col, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &out_slice);
+        } else if (data_dim == 3U) {
+          // col2vol: col_matrix -> dy
+          // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w)
+          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice);
+        }
       }
     }
   }
@@ -174,6 +183,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    int groups = context.Attr<int>("groups");

     const int batch_size = static_cast<int>(input->dims()[0]);

@@ -205,9 +215,11 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};

-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups};
     filter.Resize(filter_matrix_shape);
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;

     // convolution transpose grad on input:
     // im2col + gemm (similar to conv-forward)
@@ -233,7 +245,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
     }
-    if (filter_grad) {  // filter size (m, c, k_h, k_w)
+    if (filter_grad) {  // filter size (m, c/g, k_h, k_w)
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
       filter_grad_ = *filter_grad;
@@ -268,8 +280,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         // or
        // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
        // d, h, w)
-        blas.MatMul(filter, false, col_matrix, false, static_cast<T>(1.0),
-                    &input_grad_batch, static_cast<T>(0.0));
+        for (int g = 0; g < groups; g++) {
+          Tensor input_grad_slice =
+              input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+          Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+          Tensor col_matrix_slice =
+              col_matrix.Slice(g * col_step, (g + 1) * col_step);
+
+          blas.MatMul(filter_slice, false, col_matrix_slice, false,
+                      static_cast<T>(1.0), &input_grad_slice,
+                      static_cast<T>(0.0));
+        }
       }
       if (filter_grad) {
         // input batch
@@ -279,8 +300,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
         // or
         // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
         // k_h * k_w)
-        blas.MatMul(in_batch, false, col_matrix, true, static_cast<T>(1.0),
-                    &filter_grad_, static_cast<T>(1.0));
+        for (int g = 0; g < groups; g++) {
+          Tensor in_batch_slice =
+              in_batch.Slice(g * in_step, (g + 1) * in_step);
+          Tensor filter_grad_slice =
+              filter_grad_.Slice(g * in_step, (g + 1) * in_step);
+          Tensor col_matrix_slice =
+              col_matrix.Slice(g * col_step, (g + 1) * col_step);
+          blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
+                      static_cast<T>(1.0), &filter_grad_slice,
+                      static_cast<T>(1.0));
+        }
       }
     }
   }
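In the GEMM kernels above, grouping is handled purely by channel slicing: each batch item's input channels are cut into in_step-wide blocks, the filter rows into matching blocks, and the output channels into out_step-wide blocks, and the MatMul/col2im pair runs once per block. A rough NumPy sketch of the shape bookkeeping for the forward pass (illustrative names and sizes only; the real col2im scatter is omitted):

import numpy as np

# Illustrative sizes: m input channels, c output channels, k_h x k_w kernel,
# h x w input resolution, g groups.
m, c, k_h, k_w, h, w, g = 8, 4, 3, 3, 5, 5, 2
in_step, out_step = m // g, c // g

input_batch = np.random.rand(m, h * w)                 # (m, h * w)
filter_mat = np.random.rand(m, (c // g) * k_h * k_w)   # (m, c/g * k_h * k_w)

for gi in range(g):
    in_slice = input_batch[gi * in_step:(gi + 1) * in_step]     # (m/g, h * w)
    filter_slice = filter_mat[gi * in_step:(gi + 1) * in_step]  # (m/g, c/g * k_h * k_w)
    # Matches blas.MatMul(filter_slice, /*transpose=*/true, in_slice, false, ...)
    col_matrix = filter_slice.T @ in_slice                      # (c/g * k_h * k_w, h * w)
    # col2im would now scatter col_matrix into this group's out_step output
    # channels, i.e. output_batch[gi * out_step:(gi + 1) * out_step].
    print(col_matrix.shape)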

python/paddle/fluid/layers/nn.py

Lines changed: 9 additions & 1 deletion

@@ -1708,6 +1708,7 @@ def conv2d_transpose(input,
                      padding=0,
                      stride=1,
                      dilation=1,
+                     groups=None,
                      param_attr=None,
                      bias_attr=None,
                      use_cudnn=True,
@@ -1778,6 +1779,12 @@ def conv2d_transpose(input,
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
            Default: None
        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
@@ -1832,7 +1839,8 @@ def conv2d_transpose(input,
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')

-    filter_shape = [input_channel, num_filters] + filter_size
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)

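With the wrapper accepting groups and shrinking the filter's second dimension to num_filters / groups, the layer can be used roughly as below. This is a hedged usage sketch against the Fluid API as it existed around this commit; the layer name and shapes are made up for illustration:

import paddle.fluid as fluid

# NCHW input with 8 channels, upsampled into 16 channels with 2 groups,
# so each group of filters only sees half of the input channels.
data = fluid.layers.data(name='data', shape=[8, 32, 32], dtype='float32')
up = fluid.layers.conv2d_transpose(
    input=data,
    num_filters=16,
    filter_size=3,
    stride=2,
    padding=1,
    groups=2)
# Created filter parameter shape: [input_channel, num_filters / groups] + filter_size
#                               = [8, 8, 3, 3]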
