
Commit 6e13c86

Author: Yibing Liu (committed)

Enable multiple groups for cudnn conv transpose

1 parent 669c0df · commit 6e13c86

File tree: 2 files changed (+54, -19 lines)


paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc

Lines changed: 39 additions & 19 deletions
@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 
     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
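The new `groups` argument threads through to the descriptor helpers in cudnn_helper.h, which are not part of this diff. A minimal sketch of what the groups-aware overload presumably does, assuming it shrinks only the channel extent while keeping full-tensor strides (the property the per-group pointer offsets below rely on):

#include <vector>

// Sketch only, not the actual Paddle helper: the channel extent shrinks to
// C / groups, while strides are still derived from the full NCHW dims, so
// offsetting the data pointer by one group's elements selects the g-th
// channel slice of every sample in the batch.
void GroupedDescriptorDims(const std::vector<int>& dims, int groups,
                           std::vector<int>* dims_with_group,
                           std::vector<int>* strides) {
  dims_with_group->assign(dims.begin(), dims.end());
  if (groups > 1) (*dims_with_group)[1] /= groups;  // per-group channel count

  strides->assign(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    (*strides)[i] = dims[i + 1] * (*strides)[i + 1];  // full-tensor strides
  }
  // Both vectors would then be handed to cudnnSetTensorNdDescriptor.
}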
@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 
     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }
 
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
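Here `input_offset` is the element count of one group's channels within a single sample, so `input_data + input_offset * g` addresses group g. A quick sanity check on the arithmetic, using the shapes from the TestCUDNNWithGroups case added below (standalone illustration, not part of the patch):

#include <cassert>

int main() {
  // TestCUDNNWithGroups shapes: input [2, 4, 5, 5] NCHW, groups = 2.
  const int N = 2, M = 4, H = 5, W = 5, groups = 2;
  const int numel = N * M * H * W;              // 200 elements
  const int input_offset = numel / N / groups;  // 50 = (M / groups) * H * W
  assert(input_offset == (M / groups) * H * W);
  // input_data + input_offset * g points at channels [g * M/groups,
  // (g + 1) * M/groups) of sample 0; the descriptor's full batch stride
  // (M * H * W) then steps over all groups to reach the next sample.
  return 0;
}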
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter: (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,27 +213,39 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }
 
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
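cuDNN has no dedicated transposed-convolution primitive, so both kernels reuse the ordinary convolution routines with the roles of input and output swapped: the transpose forward pass maps to cudnnConvolutionBackwardData, its data gradient maps to cudnnConvolutionForward, and the filter gradient remains cudnnConvolutionBackwardFilter with swapped tensors. A toy 1-D sketch (illustrative only: stride 1, no padding, single channel) of why the forward pass is exactly backward-data:

#include <cstdio>
#include <vector>

// Valid 1-D convolution: y[i] = sum_k x[i + k] * w[k]
std::vector<float> Conv1D(const std::vector<float>& x,
                          const std::vector<float>& w) {
  std::vector<float> y(x.size() - w.size() + 1, 0.f);
  for (size_t i = 0; i < y.size(); ++i)
    for (size_t k = 0; k < w.size(); ++k) y[i] += x[i + k] * w[k];
  return y;
}

// Backward-data of Conv1D: scatter each y element back through w. This is
// precisely the transposed convolution of y with w.
std::vector<float> ConvTranspose1D(const std::vector<float>& y,
                                   const std::vector<float>& w) {
  std::vector<float> x(y.size() + w.size() - 1, 0.f);
  for (size_t i = 0; i < y.size(); ++i)
    for (size_t k = 0; k < w.size(); ++k) x[i + k] += y[i] * w[k];
  return x;
}

int main() {
  std::vector<float> x = {1, 2, 3, 4}, w = {1, 0, -1};
  auto y = Conv1D(x, w);            // {-2, -2}
  auto xt = ConvTranspose1D(y, w);  // back to length 4
  for (float v : xt) std::printf("%g ", v);  // prints: -2 -2 2 2
  std::printf("\n");
  return 0;
}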

python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py

Lines changed: 15 additions & 0 deletions
@@ -227,6 +227,21 @@ def init_op_type(self):
         self.op_type = "conv2d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
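For reference, the expected output shape of the new test case works out as follows (standalone illustration; ConvTransposeOutSize is a hypothetical helper, not part of the test):

#include <cstdio>

// One spatial dim of a conv-transpose output, from the standard relation
// O = (I - 1) * stride - 2 * pad + dilation * (K - 1) + 1.
int ConvTransposeOutSize(int in, int k, int pad, int stride, int dilation) {
  return (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1;
}

int main() {
  // TestCUDNNWithGroups: input [2, 4, 5, 5] NCHW, filter [4, 3, 3, 3],
  // groups = 2, pad = 1, stride = 1, dilation = 1. Filter dim 1 holds the
  // per-group output channels, so C = 3 * groups = 6.
  const int N = 2, groups = 2, C_per_group = 3;
  const int O_h = ConvTransposeOutSize(5, 3, 1, 1, 1);  // 5
  const int O_w = ConvTransposeOutSize(5, 3, 1, 1, 1);  // 5
  std::printf("output: [%d, %d, %d, %d]\n",             // [2, 6, 5, 5]
              N, C_per_group * groups, O_h, O_w);
  return 0;
}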
