Skip to content

Commit ce7d9b0

Browse files
authored
Exhaustive search for cuDNN conv. (#14043)
* exhaustive search for cuDNN conv. * Refine code and add unit testing. * Clean code * Fix model load in fluid/inference and unit testing in conv2d * Follow comments.
1 parent fcbe84c commit ce7d9b0

File tree

14 files changed

+381
-74
lines changed

14 files changed

+381
-74
lines changed

paddle/fluid/framework/ir/graph_pattern_detector.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
#include <algorithm>
1516
#include <array>
1617
#include <string>
1718
#include <vector>

paddle/fluid/inference/api/analysis_predictor.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
// limitations under the License.
1414

1515
#pragma once
16+
#include <algorithm>
17+
#include <map>
1618
#include <string>
1719
#include <vector>
1820
#include "paddle/fluid/framework/naive_executor.h"

paddle/fluid/inference/api/helper.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,14 @@
1616

1717
#include <glog/logging.h>
1818
#include <sys/time.h>
19+
#include <algorithm>
1920
#include <chrono> // NOLINT
2021
#include <numeric>
2122
#include <sstream>
2223
#include <string>
2324
#include <vector>
25+
#include "paddle/fluid/inference/api/paddle_inference_api.h"
2426
#include "paddle/fluid/string/printf.h"
25-
#include "paddle_inference_api.h"
2627

2728
namespace paddle {
2829
namespace inference {

paddle/fluid/inference/io.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
5959
bool IsPersistable(const framework::VarDesc* var) {
6060
if (var->Persistable() &&
6161
var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
62-
var->GetType() != framework::proto::VarType::FETCH_LIST) {
62+
var->GetType() != framework::proto::VarType::FETCH_LIST &&
63+
var->GetType() != framework::proto::VarType::RAW) {
6364
return true;
6465
}
6566
return false;

paddle/fluid/operators/add_position_encoding_op.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
6666
x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
6767
for (int j = 0; j < max_length; ++j) {
6868
for (int k = 0; k < half_size; ++k) {
69-
const double val = (half_size > 1)
70-
? j / pow(10000.0, double(k) / (half_size - 1))
71-
: j / 10000.0;
69+
const double val =
70+
(half_size > 1)
71+
? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
72+
: j / 10000.0;
7273
dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
7374
dst_ptr[half_size + k] =
7475
src_ptr[half_size + k] * alpha + cos(val) * beta;

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 185 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,22 @@ limitations under the License. */
1515
#include "paddle/fluid/framework/eigen.h"
1616
#include "paddle/fluid/framework/op_registry.h"
1717
#include "paddle/fluid/memory/memory.h"
18+
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
1819
#include "paddle/fluid/operators/conv_op.h"
1920
#include "paddle/fluid/platform/assert.h"
2021
#include "paddle/fluid/platform/cudnn_helper.h"
2122
#include "paddle/fluid/platform/float16.h"
23+
#include "paddle/fluid/platform/profiler.h"
2224

2325
DEFINE_bool(cudnn_deterministic, false,
2426
"Whether allow using an autotuning algorithm for convolution "
2527
"operator. The autotuning algorithm may be non-deterministic. If "
2628
"true, the algorithm is deterministic.");
29+
DEFINE_uint64(conv_workspace_size_limit, 4096,
30+
"cuDNN convolution workspace limit in MB unit.");
31+
DEFINE_bool(cudnn_exhaustive_search, false,
32+
"Whether enable exhaustive search for cuDNN convolution or "
33+
"not, default is False.");
2734

2835
namespace paddle {
2936
namespace operators {
@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
3643
template <typename T>
3744
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
3845

46+
static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
47+
static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
48+
static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
49+
3950
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
4051
static_cast<size_t>(1024) * 1024 * 1024;
4152

53+
static constexpr size_t kNUM_CUDNN_FWD_ALGS =
54+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
55+
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
56+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
57+
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
58+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
59+
4260
template <typename T>
4361
class CUDNNConvOpKernel : public framework::OpKernel<T> {
4462
public:
4563
void Compute(const framework::ExecutionContext& ctx) const override {
64+
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
4665
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
4766
"It must use CUDAPlace.");
4867
auto* input = ctx.Input<Tensor>("Input");
@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
5574
int groups = ctx.Attr<int>("groups");
5675
int64_t user_workspace_size =
5776
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
77+
bool exhaustive_search =
78+
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
5879

5980
const T* input_data = input->data<T>();
6081
const T* filter_data = filter->data<T>();
@@ -120,19 +141,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
120141
// ------------------- cudnn conv workspace ---------------------
121142
size_t workspace_size_in_bytes; // final workspace to allocate.
122143
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
123-
if (user_workspace_size > 0) {
124-
workspace_size_limit = user_workspace_size * 1024 * 1024;
144+
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
145+
int64_t max_user_size =
146+
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
147+
user_workspace_size);
148+
workspace_size_limit = max_user_size * 1024 * 1024;
125149
}
150+
126151
// ------------------- cudnn conv algorithm ---------------------
127152
cudnnConvolutionFwdAlgo_t algo;
128-
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
129153
auto handle = dev_ctx.cudnn_handle();
130154

131-
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
132-
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
133-
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
134-
workspace_size_limit, &algo));
135-
155+
bool half_float = false;
136156
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
137157
// Tensor core is supported since the volta GPU and
138158
// is only enabled when input and filter data are float16
@@ -143,12 +163,65 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
143163
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
144164
// Currently tensor core is only enabled using this algo
145165
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
166+
half_float = true;
146167
} else {
147168
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
148169
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
149170
}
150171
#endif
151172

173+
auto x_dims = framework::vectorize(input->dims());
174+
auto f_dims = framework::vectorize(filter->dims());
175+
if ((!exhaustive_search) && (!half_float)) {
176+
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
177+
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
178+
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
179+
workspace_size_limit, &algo));
180+
VLOG(3) << "cuDNN forward algo " << algo;
181+
} else if (exhaustive_search && (!half_float)) {
182+
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
183+
if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
184+
algo_cache =
185+
ctx.scope()
186+
.FindVar(kCUDNNFwdAlgoCache)
187+
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
188+
} else {
189+
algo_cache =
190+
const_cast<framework::Scope&>(ctx.scope())
191+
.Var(kCUDNNFwdAlgoCache)
192+
->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
193+
}
194+
algo = algo_cache->GetAlgorithm(
195+
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
196+
int returned_algo_count;
197+
std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
198+
fwd_perf_stat;
199+
auto cudnn_find_func = [&](void* cudnn_workspace) {
200+
CUDNN_ENFORCE(
201+
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
202+
handle, cudnn_input_desc, input_data, cudnn_filter_desc,
203+
filter_data, cudnn_conv_desc, cudnn_output_desc,
204+
output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
205+
fwd_perf_stat.data(), cudnn_workspace,
206+
workspace_size_limit));
207+
};
208+
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
209+
workspace_size_limit);
210+
211+
VLOG(3) << "Perf result: (algo: stat, time, memory)";
212+
for (int i = 0; i < returned_algo_count; ++i) {
213+
const auto& stat = fwd_perf_stat[i];
214+
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
215+
<< " " << stat.memory;
216+
}
217+
return fwd_perf_stat[0].algo;
218+
});
219+
VLOG(3) << "choose algo " << algo;
220+
} else {
221+
PADDLE_ENFORCE(half_float,
222+
"cuDNN exhaustive search doesn't support half float.");
223+
}
224+
152225
// get workspace size able to allocate
153226
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
154227
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -178,6 +251,7 @@ template <typename T>
178251
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
179252
public:
180253
void Compute(const framework::ExecutionContext& ctx) const override {
254+
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
181255
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
182256
"It must use CUDAPlace.");
183257
auto input = ctx.Input<Tensor>("Input");
@@ -196,6 +270,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
196270
int groups = ctx.Attr<int>("groups");
197271
int64_t user_workspace_size =
198272
static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
273+
bool exhaustive_search =
274+
FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
275+
if (exhaustive_search && FLAGS_cudnn_deterministic) {
276+
PADDLE_THROW(
277+
"Can't set exhaustive_search True and "
278+
"FLAGS_cudnn_deterministic True at the same time.");
279+
}
199280

200281
// ------------------- cudnn descriptors ---------------------
201282
ScopedTensorDescriptor input_desc;
@@ -263,14 +344,65 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
263344
cudnnConvolutionBwdFilterAlgo_t filter_algo;
264345
size_t workspace_size_in_bytes = 0, tmp_size = 0;
265346
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
266-
if (user_workspace_size > 0) {
267-
workspace_size_limit = user_workspace_size * 1024 * 1024;
347+
if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
348+
int64_t max_user_size =
349+
std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
350+
user_workspace_size);
351+
workspace_size_limit = max_user_size * 1024 * 1024;
268352
}
269353

270-
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
354+
auto x_dims = framework::vectorize(input->dims());
355+
auto f_dims = framework::vectorize(filter->dims());
271356
auto handle = dev_ctx.cudnn_handle();
272357
if (input_grad) {
273-
if (!FLAGS_cudnn_deterministic) {
358+
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
359+
if (exhaustive_search) {
360+
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
361+
if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
362+
data_algo_cache =
363+
ctx.scope()
364+
.FindVar(kCUDNNBwdDataAlgoCache)
365+
->GetMutable<
366+
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
367+
} else {
368+
data_algo_cache =
369+
const_cast<framework::Scope&>(ctx.scope())
370+
.Var(kCUDNNBwdDataAlgoCache)
371+
->GetMutable<
372+
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
373+
}
374+
data_algo = data_algo_cache->GetAlgorithm(
375+
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
376+
int returned_algo_count;
377+
std::array<cudnnConvolutionBwdDataAlgoPerf_t,
378+
kNUM_CUDNN_BWD_DATA_ALGS>
379+
data_perf_stat;
380+
auto cudnn_find_func = [&](void* cudnn_workspace) {
381+
CUDNN_ENFORCE(
382+
platform::dynload::
383+
cudnnFindConvolutionBackwardDataAlgorithmEx(
384+
handle, cudnn_filter_desc, filter_data,
385+
cudnn_output_grad_desc, output_grad_data,
386+
cudnn_conv_desc, cudnn_input_desc, input_grad_data,
387+
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
388+
data_perf_stat.data(), cudnn_workspace,
389+
workspace_size_limit));
390+
};
391+
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
392+
workspace_size_limit);
393+
394+
VLOG(3) << "Perf result: (algo: stat, time, memory)";
395+
for (int i = 0; i < returned_algo_count; ++i) {
396+
const auto& stat = data_perf_stat[i];
397+
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
398+
<< " " << stat.memory;
399+
}
400+
return data_perf_stat[0].algo;
401+
});
402+
VLOG(3) << "cuDNN backward data algo " << data_algo;
403+
} else if (FLAGS_cudnn_deterministic) {
404+
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
405+
} else {
274406
CUDNN_ENFORCE(
275407
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
276408
handle, cudnn_filter_desc,
@@ -283,10 +415,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
283415
cudnn_input_desc,
284416
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
285417
workspace_size_limit, &data_algo));
286-
} else {
287-
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
288418
}
289-
290419
CUDNN_ENFORCE(
291420
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
292421
handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -295,17 +424,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
295424
}
296425

297426
if (filter_grad) {
298-
if (!FLAGS_cudnn_deterministic) {
427+
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
428+
if (exhaustive_search) {
429+
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
430+
if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
431+
f_algo_cache =
432+
ctx.scope()
433+
.FindVar(kCUDNNBwdFilterAlgoCache)
434+
->GetMutable<
435+
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
436+
} else {
437+
f_algo_cache =
438+
const_cast<framework::Scope&>(ctx.scope())
439+
.Var(kCUDNNBwdFilterAlgoCache)
440+
->GetMutable<
441+
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
442+
}
443+
filter_algo = f_algo_cache->GetAlgorithm(
444+
x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
445+
int returned_algo_count;
446+
std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
447+
kNUM_CUDNN_BWD_FILTER_ALGS>
448+
filter_perf_stat;
449+
auto cudnn_find_f_func = [&](void* cudnn_workspace) {
450+
CUDNN_ENFORCE(
451+
platform::dynload::
452+
cudnnFindConvolutionBackwardFilterAlgorithmEx(
453+
handle, cudnn_input_desc, input_data,
454+
cudnn_output_grad_desc, output_grad_data,
455+
cudnn_conv_desc, cudnn_filter_desc,
456+
filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
457+
&returned_algo_count, filter_perf_stat.data(),
458+
cudnn_workspace, workspace_size_limit));
459+
};
460+
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_f_func,
461+
workspace_size_limit);
462+
return filter_perf_stat[0].algo;
463+
});
464+
VLOG(3) << "cuDNN backward filter algo " << filter_algo;
465+
} else if (FLAGS_cudnn_deterministic) {
466+
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
467+
} else {
299468
CUDNN_ENFORCE(
300469
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
301470
handle, cudnn_input_desc, cudnn_output_grad_desc,
302471
cudnn_conv_desc, cudnn_filter_desc,
303472
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
304473
workspace_size_limit, &filter_algo));
305-
} else {
306-
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
307474
}
308-
309475
CUDNN_ENFORCE(
310476
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
311477
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,

0 commit comments

Comments
 (0)