@@ -15,15 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "true, the algorithm is deterministic.");
+DEFINE_uint64(conv_workspace_size_limit, 4096,
+              "cuDNN convolution workspace limit in MB.");
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether to enable exhaustive search for the cuDNN convolution "
+            "algorithm or not; default is False.");
 
 namespace paddle {
 namespace operators {
@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
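+// Scope variable names under which the algorithm caches filled in by cuDNN
+// exhaustive search are stored, so repeated input/filter shapes reuse the
+// algorithm that was already benchmarked.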
+static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
+static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
+static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
+
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
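+// Upper bounds on how many algorithms each cudnnFind*AlgorithmEx call may
+// return, taken from the corresponding cuDNN *_ALGO_COUNT enum values.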
+static constexpr size_t kNUM_CUDNN_FWD_ALGS =
+    CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -120,19 +141,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
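+    // The effective limit is the larger of the global flag and the per-op
+    // workspace_size_MB attribute, converted from MB to bytes.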
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
     }
+
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
-
+    bool half_float = false;
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
@@ -143,12 +163,65 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      half_float = true;
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
 
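+    // Forward algorithm selection: by default ask cuDNN's heuristic under the
+    // workspace limit; with exhaustive_search, benchmark every algorithm once
+    // per (input, filter) shape and cache the fastest one in the scope.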
+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
+    if ((!exhaustive_search) && (!half_float)) {
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &algo));
+      VLOG(3) << "cuDNN forward algo " << algo;
+    } else if (exhaustive_search && (!half_float)) {
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
+      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
+        algo_cache =
+            ctx.scope()
+                .FindVar(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      } else {
+        algo_cache =
+            const_cast<framework::Scope&>(ctx.scope())
+                .Var(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      }
+      algo = algo_cache->GetAlgorithm(
+          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+            int returned_algo_count;
+            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
+                fwd_perf_stat;
+            auto cudnn_find_func = [&](void* cudnn_workspace) {
+              CUDNN_ENFORCE(
+                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                      filter_data, cudnn_conv_desc, cudnn_output_desc,
+                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                      fwd_perf_stat.data(), cudnn_workspace,
+                      workspace_size_limit));
+            };
+            dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
+                                              workspace_size_limit);
+
+            VLOG(3) << "Perf result: (algo: stat, time, memory)";
+            for (int i = 0; i < returned_algo_count; ++i) {
+              const auto& stat = fwd_perf_stat[i];
+              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                      << " " << stat.memory;
+            }
+            return fwd_perf_stat[0].algo;
+          });
+      VLOG(3) << "choose algo " << algo;
+    } else {
+      PADDLE_ENFORCE(half_float,
+                     "cuDNN exhaustive search doesn't support half float.");
+    }
+
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -178,6 +251,7 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
@@ -196,6 +270,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
+    if (exhaustive_search && FLAGS_cudnn_deterministic) {
+      PADDLE_THROW(
+          "Can't set exhaustive_search to True and "
+          "FLAGS_cudnn_deterministic to True at the same time.");
+    }
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -263,14 +344,65 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
     size_t workspace_size_in_bytes = 0, tmp_size = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
     }
 
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (!FLAGS_cudnn_deterministic) {
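+      // Backward-data algorithm: exhaustive_search benchmarks all algorithms
+      // and caches the result per shape, FLAGS_cudnn_deterministic forces
+      // ALGO_1, and otherwise cuDNN's heuristic picks one under the limit.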
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
+          data_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        } else {
+          data_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        }
+        data_algo = data_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
+                         kNUM_CUDNN_BWD_DATA_ALGS>
+                  data_perf_stat;
+              auto cudnn_find_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardDataAlgorithmEx(
+                            handle, cudnn_filter_desc, filter_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
+                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
+                            data_perf_stat.data(), cudnn_workspace,
+                            workspace_size_limit));
+              };
+              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
+                                                workspace_size_limit);
+
+              VLOG(3) << "Perf result: (algo: stat, time, memory)";
+              for (int i = 0; i < returned_algo_count; ++i) {
+                const auto& stat = data_perf_stat[i];
+                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                        << " " << stat.memory;
+              }
+              return data_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward data algo " << data_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      } else {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -283,10 +415,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 cudnn_input_desc,
                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &data_algo));
-      } else {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -295,17 +424,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (!FLAGS_cudnn_deterministic) {
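+      // Backward-filter algorithm: same selection scheme as backward data;
+      // the exhaustive branch here only logs the chosen algorithm.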
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
+          f_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        } else {
+          f_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        }
+        filter_algo = f_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
+                         kNUM_CUDNN_BWD_FILTER_ALGS>
+                  filter_perf_stat;
+              auto cudnn_find_f_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                            handle, cudnn_input_desc, input_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_filter_desc,
+                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
+                            &returned_algo_count, filter_perf_stat.data(),
+                            cudnn_workspace, workspace_size_limit));
+              };
+              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_f_func,
+                                                workspace_size_limit);
+              return filter_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      } else {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &filter_algo));
-      } else {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,