@@ -15,22 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "true, the algorithm is deterministic.");
-DEFINE_uint64(conv_workspace_size_limit, 4096,
-              "cuDNN convolution workspace limit in MB unit.");
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, defalut is False.");
 
 namespace paddle {
 namespace operators {
@@ -43,25 +36,13 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
-static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
-static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
-static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
-
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
-static constexpr size_t kNUM_CUDNN_FWD_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
-    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -74,8 +55,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
 
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -141,18 +120,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
-
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    bool half_float = false;
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
@@ -163,65 +143,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      half_float = true;
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
 
-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
-    if ((!exhaustive_search) && (!half_float)) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-      VLOG(3) << "cuDNN forward algo " << algo;
-    } else if (exhaustive_search && (!half_float)) {
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
-      algo = algo_cache->GetAlgorithm(
-          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-            int returned_algo_count;
-            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
-                fwd_perf_stat;
-            auto cudnn_find_func = [&](void* cudnn_workspace) {
-              CUDNN_ENFORCE(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                      filter_data, cudnn_conv_desc, cudnn_output_desc,
-                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      fwd_perf_stat.data(), cudnn_workspace,
-                      workspace_size_limit));
-            };
-            dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                              workspace_size_limit);
-
-            VLOG(3) << "Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = fwd_perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return fwd_perf_stat[0].algo;
-          });
-      VLOG(3) << "choose algo " << algo;
-    } else {
-      PADDLE_ENFORCE(half_float,
-                     "cuDNN exhaustive search doesn't support half float.");
-    }
-
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -251,7 +178,6 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");
@@ -270,13 +196,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-    if (exhaustive_search && FLAGS_cudnn_deterministic) {
-      PADDLE_THROW(
-          "Cann't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }
 
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -344,65 +263,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
     size_t workspace_size_in_bytes = 0, tmp_size = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
 
-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
-          data_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        } else {
-          data_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        }
-        data_algo = data_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
-                         kNUM_CUDNN_BWD_DATA_ALGS>
-                  data_perf_stat;
-              auto cudnn_find_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardDataAlgorithmEx(
-                            handle, cudnn_filter_desc, filter_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
-                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                            data_perf_stat.data(), cudnn_workspace,
-                            workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                                workspace_size_limit);
-
-              VLOG(3) << "Perf result: (algo: stat, time, memory)";
-              for (int i = 0; i < returned_algo_count; ++i) {
-                const auto& stat = data_perf_stat[i];
-                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                        << " " << stat.memory;
-              }
-              return data_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward data algo " << data_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -415,7 +283,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 cudnn_input_desc,
                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -424,54 +295,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
-          f_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        } else {
-          f_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        }
-        filter_algo = f_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
-                         kNUM_CUDNN_BWD_FILTER_ALGS>
-                  filter_perf_stat;
-              auto cudnn_find_f_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                            handle, cudnn_input_desc, input_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_filter_desc,
-                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
-                            &returned_algo_count, filter_perf_stat.data(),
-                            cudnn_workspace, workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_f_func,
-                                                workspace_size_limit);
-              return filter_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,