Commit a06bec1

Conv cudnn 3d (#5783)
* conv cudnn 3d
* update test case
* update
* update
* follow comments and remove groups from helper
* update
* refine
* update
* follow comments2
* update
* fix compile
1 parent 52a7358 commit a06bec1

6 files changed: +170 additions, -52 deletions

paddle/operators/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -73,6 +73,13 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
   endif()
 
+  # conv_cudnn_op contains several operators
+  if ("${TARGET}" STREQUAL "conv_cudnn_op")
+    set(pybind_flag 1)
+    # It's enough to just adding one operator to pybind
+    file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
+  endif()
+
   # pool_op contains several operators
   if ("${TARGET}" STREQUAL "pool_op")
     set(pybind_flag 1)

paddle/operators/conv_cudnn_op.cc

Lines changed: 34 additions & 7 deletions
@@ -17,10 +17,10 @@
 namespace paddle {
 namespace operators {
 
-class CudnnConvOpMaker : public Conv2DOpMaker {
+class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConvOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
@@ -32,16 +32,43 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
   }
 };
 
+class CudnnConv3DOpMaker : public Conv3DOpMaker {
+ public:
+  CudnnConv3DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : Conv3DOpMaker(proto, op_checker) {
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+  }
+};
+
 } // namespace operators
 } // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::ConvOpGrad);
+REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
+            conv2d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
+            conv3d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
-REGISTER_OP_CPU_KERNEL(conv_cudnn,
+REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    conv3d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
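
The workspace_size_MB attribute registered by both makers is a size hint in megabytes; the GPU kernels turn it into a byte limit before letting cuDNN pick an algorithm under the CUDNN_CONVOLUTION_*_SPECIFY_WORKSPACE_LIMIT policies. That conversion is not part of this diff, so the snippet below is only a minimal sketch of the pattern, with illustrative names.

// Hypothetical helper (not from the commit): turn the workspace_size_MB
// attribute into the byte limit handed to cuDNN's algorithm-selection calls.
#include <cstddef>

size_t WorkspaceLimitBytes(int workspace_size_mb) {
  constexpr size_t kBytesPerMB = 1024 * 1024;
  // e.g. the default SetDefault(4096) above corresponds to a 4 GiB limit.
  return static_cast<size_t>(workspace_size_mb) * kBytesPerMB;
}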

paddle/operators/conv_cudnn_op.cu.cc

Lines changed: 94 additions & 35 deletions
@@ -56,26 +56,56 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
        layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn conv workspace ---------------------
     void* cudnn_workspace = nullptr;
@@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_grad_desc;
-    ScopedTensorDescriptor input_grad_desc;
 
     ScopedFilterDescriptor filter_desc;
     ScopedFilterDescriptor filter_grad_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    // cudnn 7 can support groups, no need to do it mannually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output_grad->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
-    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
-
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+
     int output_grad_channels = filter->dims()[0];
-    int output_grad_height = output_grad->dims()[2];
-    int output_grad_width = output_grad->dims()[3];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
-    int group_offset_out =
-        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
     int group_offset_filter = filter->numel() / groups;
     // ------------------- cudnn backward algorithm ---------------------
     cudnnConvolutionBwdDataAlgo_t data_algo;
@@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 
     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
-      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,
@@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
               cudnn_output_grad_desc, cudnn_conv_desc,
              // dxDesc: Handle to the previously initialized output tensor
              // descriptor.
-              cudnn_input_grad_desc,
+              cudnn_input_desc,
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit, &data_algo));
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
              handle, cudnn_filter_desc, cudnn_output_grad_desc,
-              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
 
     if (filter_grad) {
-      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -222,7 +275,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
@@ -233,21 +285,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
            handle, &alpha, cudnn_filter_desc,
            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
-            cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
       }
     }
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
-
       for (int i = 0; i < groups; i++) {
         PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
            cudnn_conv_desc, filter_algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
            filter_grad_data + i * group_offset_filter));
       }
     }
@@ -259,8 +310,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 } // namespace operators
 } // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>,
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
+                       paddle::operators::CudnnConvOpKernel<double>);
+REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad,
+                       paddle::operators::CudnnConvGradOpKernel<float>,
+                       paddle::operators::CudnnConvGradOpKernel<double>);
+
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn,
+                       paddle::operators::CudnnConvOpKernel<float>,
                        paddle::operators::CudnnConvOpKernel<double>);
-REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
+REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad,
                        paddle::operators::CudnnConvGradOpKernel<float>,
                        paddle::operators::CudnnConvGradOpKernel<double>);
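
The kernels above handle grouped convolution in two ways: before cuDNN 7 they loop over the groups themselves, advancing the data pointers by per-group offsets computed from channels / groups and the spatial extents; from cuDNN 7 on they hand the group count to cudnnSetConvolutionGroupCount and reset groups to 1 so the loop runs once. Below is a self-contained sketch of that offset arithmetic for an NCDHW tensor; the struct, function, and sizes are illustrative, not taken from the kernel.

// Illustrative only: per-group element offset for an NCDHW tensor, matching
// the channels / groups * depth * height * width expression used above.
#include <cstddef>
#include <iostream>

struct Dims5 {
  int n, c, d, h, w;  // batch, channels, depth, height, width (NCDHW)
};

size_t GroupOffset(const Dims5& dims, int groups) {
  return static_cast<size_t>(dims.c / groups) * dims.d * dims.h * dims.w;
}

int main() {
  Dims5 input{2, 8, 4, 16, 16};
  int groups = 2;
  // Group i of one sample starts at data + i * GroupOffset(input, groups);
  // here each group spans 4 * 4 * 16 * 16 = 4096 elements.
  std::cout << GroupOffset(input, groups) << "\n";  // prints 4096
  return 0;
}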

paddle/platform/cudnn_helper.h

Lines changed: 2 additions & 3 deletions
@@ -116,7 +116,7 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
     case DataLayout::kNCHW:
       return CUDNN_TENSOR_NCHW;
     case DataLayout::kNCDHW:
-      return CUDNN_TENSOR_NCHW; // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW
+      return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same
     default:
       PADDLE_THROW("Unknown cudnn equivalent for order");
   }
@@ -143,7 +143,7 @@ class ScopedTensorDescriptor {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
     // Update tensor descriptor dims setting if groups > 1
-    // FIXME(typhoonzero): Assume using NCHW or NCDHW order
+    // NOTE: Assume using NCHW or NCDHW order
     std::vector<int> dims_with_group(dims.begin(), dims.end()); // copy
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
@@ -186,7 +186,6 @@ class ScopedFilterDescriptor {
     // width of the filter.
     std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
     if (groups > 1) {
-      // M /= groups
       kernel_with_group[0] /= groups;
       // NOTE: input filter(C) of the filter is already asserted to be C/groups.
     }
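
For context, the strides[i] = dims[i + 1] * strides[i + 1] loop kept in the second hunk builds packed strides for the Nd tensor descriptor, which is why NCHW and NCDHW can share the CUDNN_TENSOR_NCHW tag above. A standalone sketch of that stride computation follows; the example dims are illustrative, not from the header.

// Packed strides for a contiguous tensor, innermost dimension fastest.
#include <vector>

std::vector<int> PackedStrides(const std::vector<int>& dims) {
  std::vector<int> strides(dims.size());
  strides.back() = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }
  return strides;
}
// Example: NCDHW dims {8, 16, 4, 32, 32} -> strides {65536, 4096, 1024, 32, 1}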

python/paddle/v2/fluid/tests/test_conv2d_op.py

Lines changed: 7 additions & 7 deletions
@@ -16,8 +16,8 @@ def conv2d_forward_naive(input, filter, group, conv_param):
     out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
     out = np.zeros((in_n, out_c, out_h, out_w))
 
-    d_bolck_w = (dilation[0] * (f_h - 1) + 1)
-    d_bolck_h = (dilation[1] * (f_w - 1) + 1)
+    d_bolck_h = (dilation[0] * (f_h - 1) + 1)
+    d_bolck_w = (dilation[1] * (f_w - 1) + 1)
 
     input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
                        mode='constant',
@@ -167,27 +167,27 @@ def init_group(self):
 #----------------Conv2dCudnn----------------
 class TestCudnn(TestConv2dOp):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithPad(TestWithPad):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithStride(TestWithStride):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWithGroup(TestWithGroup):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 class TestCudnnWith1x1(TestWith1x1):
     def init_op_type(self):
-        self.op_type = "conv_cudnn"
+        self.op_type = "conv2d_cudnn"
 
 
 # cudnn v5 does not support dilation conv.
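
The d_bolck_h / d_bolck_w swap above restores the usual dilated-convolution geometry: the effective kernel extent along an axis is dilation * (k - 1) + 1, and the output size follows the same 1 + (in + 2 * pad - effective_k) / stride formula already used for out_h and out_w. A worked sketch of that formula, written in C++ for consistency with the rest of the commit and purely illustrative:

// Output size of a dilated convolution along one spatial axis.
int ConvOutSize(int in, int pad, int dilation, int kernel, int stride) {
  int effective_kernel = dilation * (kernel - 1) + 1;  // the d_bolck_* term
  return 1 + (in + 2 * pad - effective_kernel) / stride;
}
// e.g. in = 5, pad = 1, dilation = 2, kernel = 3, stride = 1
//      -> effective_kernel = 5, output = 1 + (5 + 2 - 5) / 1 = 3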
