@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {

     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);

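The only change in this hunk is the extra `groups` argument passed to the tensor and filter descriptor helpers. The intent, as far as the patch shows, is that each descriptor describes a single group's worth of channels to cuDNN, so each call in the loops added below only touches one group. A minimal sketch of that idea, using a hypothetical `GroupedDims` helper rather than Paddle's actual `ScopedTensorDescriptor`/`ScopedFilterDescriptor`:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Hypothetical helper (an assumption, not Paddle's API): given the full
// NCHW/NCDHW dims and a group count, return the per-group dims a cuDNN
// descriptor could be built from. Assumes dims[1] is the channel axis and
// is divisible by groups.
std::vector<int> GroupedDims(std::vector<int> dims, int groups) {
  assert(dims.size() >= 2 && dims[1] % groups == 0);
  dims[1] /= groups;  // each group only sees C / groups channels
  return dims;
}

int main() {
  // Example: an (N=2, M=8, H=5, W=5) input with groups=2 would be described
  // as (2, 4, 5, 5); the per-group data is then reached with the pointer
  // offsets computed later in the kernel.
  std::vector<int> d = GroupedDims({2, 8, 5, 5}, 2);
  std::printf("%d %d %d %d\n", d[0], d[1], d[2], d[3]);
  return 0;
}
```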
@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);

     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }

     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
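The new offsets split one sample's elements (and the whole filter) evenly across groups, and the loop advances each raw pointer by `offset * g` so a call only sees group `g`'s slice; the descriptors above already describe a single group's channel count. A standalone sketch of the arithmetic with illustrative shapes (none of these numbers come from the op itself):

```cpp
#include <cstdio>

int main() {
  // Illustrative shapes only: input (N, M, H, W), output (N, C, O_h, O_w).
  const int N = 2, M = 8, H = 5, W = 5;
  const int C = 4, O_h = 9, O_w = 9;
  const int groups = 2;

  const int input_numel = N * M * H * W;       // 400
  const int output_numel = N * C * O_h * O_w;  // 648
  const int filter_numel = 144;  // illustrative; the filter layout is omitted

  // Mirrors the kernel's arithmetic: one sample's elements, split per group.
  const int input_offset = input_numel / N / groups;    // 100 = (M/groups)*H*W
  const int output_offset = output_numel / N / groups;  // 162 = (C/groups)*O_h*O_w
  const int filter_offset = filter_numel / groups;      // 72

  std::printf("input_offset=%d output_offset=%d filter_offset=%d\n",
              input_offset, output_offset, filter_offset);
  return 0;
}
```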
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {

     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter: (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);

     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,27 +213,39 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }

     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
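The gradient kernel mirrors the forward path: for a transposed convolution, the input gradient comes from `cudnnConvolutionForward` and the filter gradient from `cudnnConvolutionBackwardFilter`, and both are now issued once per group with the same offset arithmetic. A sketch of that launch pattern pulled out into a hypothetical helper (not part of the patch), with a dummy callable standing in for the cuDNN call:

```cpp
#include <cstdio>

// Hypothetical helper distilling the pattern above: run one launch per group,
// advancing every raw pointer to that group's slice. The cuDNN descriptors
// are assumed to already describe a single group's channels.
template <typename T, typename Launch>
void LaunchPerGroup(int groups, int in_offset, int out_offset, int flt_offset,
                    const T* in, T* out, const T* flt, Launch launch) {
  for (int g = 0; g < groups; g++) {
    launch(in + in_offset * g, out + out_offset * g, flt + flt_offset * g);
  }
}

int main() {
  float in[8] = {}, out[8] = {}, flt[4] = {};
  // Stand-in for a cuDNN call: report which slices would be passed.
  LaunchPerGroup(2, 4, 4, 2, in, out, flt,
                 [&](const float* i, float* o, const float* f) {
                   std::printf("offsets: %td %td %td\n", i - in, o - out,
                               f - flt);
                 });
  return 0;
}
```

Newer cuDNN releases (7.0+) expose `cudnnSetConvolutionGroupCount`, which would let a single call cover all groups, but this kernel still targets cuDNN v5 (see the dilations comment), which is presumably why it loops over groups manually.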