Commit 0953cd3

Merge pull request #14284 from PaddlePaddle/revert-14043-conv_cudnn_cache
Revert " Exhaustive search for cuDNN conv."
2 parents: ce7d9b0 + db8c52d

14 files changed: +74 additions, -381 deletions

paddle/fluid/framework/ir/graph_pattern_detector.cc
Lines changed: 0 additions & 1 deletion

@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>

paddle/fluid/inference/api/analysis_predictor.h
Lines changed: 0 additions & 2 deletions

@@ -13,8 +13,6 @@
 // limitations under the License.

 #pragma once
-#include <algorithm>
-#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"

paddle/fluid/inference/api/helper.h
Lines changed: 1 addition & 2 deletions

@@ -16,14 +16,13 @@

 #include <glog/logging.h>
 #include <sys/time.h>
-#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle_inference_api.h"

 namespace paddle {
 namespace inference {

paddle/fluid/inference/io.cc
Lines changed: 1 addition & 2 deletions

@@ -59,8 +59,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
   if (var->Persistable() &&
       var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST &&
-      var->GetType() != framework::proto::VarType::RAW) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST) {
     return true;
   }
   return false;

paddle/fluid/operators/add_position_encoding_op.h
Lines changed: 3 additions & 4 deletions

@@ -66,10 +66,9 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
             x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
         for (int j = 0; j < max_length; ++j) {
           for (int k = 0; k < half_size; ++k) {
-            const double val =
-                (half_size > 1)
-                    ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
-                    : j / 10000.0;
+            const double val = (half_size > 1)
+                                   ? j / pow(10000.0, double(k) / (half_size - 1))
+                                   : j / 10000.0;
             dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
             dst_ptr[half_size + k] =
                 src_ptr[half_size + k] * alpha + cos(val) * beta;
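Both sides of this hunk compute the same sinusoidal position-encoding value; the revert only restores the earlier clang-format layout and the C-style double(k) cast. For reference, a standalone sketch of the quantity being computed (hypothetical helper name, not part of the kernel):

    #include <cmath>

    // For position j and channel index k (0 <= k < half_size), the kernel
    // computes val = j / 10000^(k / (half_size - 1)); sin(val) is blended into
    // channel k and cos(val) into channel half_size + k, with the input scaled
    // by alpha and the encoding by beta.
    inline double PositionEncodingVal(int j, int k, int half_size) {
      return (half_size > 1)
                 ? j / std::pow(10000.0, static_cast<double>(k) / (half_size - 1))
                 : j / 10000.0;
    }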

paddle/fluid/operators/conv_cudnn_op.cu.cc
Lines changed: 19 additions & 185 deletions
@@ -15,22 +15,15 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "true, the algorithm is deterministic.");
-DEFINE_uint64(conv_workspace_size_limit, 4096,
-              "cuDNN convolution workspace limit in MB unit.");
-DEFINE_bool(cudnn_exhaustive_search, false,
-            "Whether enable exhaustive search for cuDNN convolution or "
-            "not, defalut is False.");

 namespace paddle {
 namespace operators {
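The two deleted DEFINE_* lines are the flags introduced by the reverted PR (#14043, per the branch name); cudnn_deterministic predates it and survives. These are ordinary gflags definitions, so any other translation unit can consume them via DECLARE_*; a minimal illustration of that pattern (hypothetical function, standard gflags API):

    #include <gflags/gflags.h>

    DECLARE_bool(cudnn_deterministic);  // pairs with the DEFINE_bool above

    // Illustrative consumer: branch on the flag when picking a conv algorithm.
    void ChooseConvAlgoPolicy() {
      if (FLAGS_cudnn_deterministic) {
        // use a fixed, reproducible algorithm (the *_ALGO_1 fallbacks below)
      } else {
        // defer to cuDNN's heuristic under the workspace limit
      }
    }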
@@ -43,25 +36,13 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;

-static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
-static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
-static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
-
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;

-static constexpr size_t kNUM_CUDNN_FWD_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
-    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
-static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
-    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");

@@ -74,8 +55,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");

     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
@@ -141,18 +120,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv workspace ---------------------
     size_t workspace_size_in_bytes;  // final workspace to allocate.
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }
-
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();

-    bool half_float = false;
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
     // Tensor core is supported since the volta GPU and
     // is only enabled when input and filter data are float16
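With FLAGS_conv_workspace_size_limit gone, the cap is driven by the workspace_size_MB attribute alone: a positive value in MB is converted to bytes (4096 MB -> 4096 * 1024 * 1024 bytes = 4 GiB), and anything else keeps the 1 GiB kCONV_CUDNN_WORKSPACE_LIMIT_BYTES default. The same conversion as a self-contained sketch (hypothetical helper, not Paddle code):

    #include <cstddef>
    #include <cstdint>

    static constexpr size_t kDefaultWorkspaceBytes =
        static_cast<size_t>(1024) * 1024 * 1024;  // 1 GiB default cap

    // A positive MB value overrides the default cap; 0 or negative keeps it.
    inline size_t WorkspaceLimitBytes(int64_t user_mb) {
      return user_mb > 0 ? static_cast<size_t>(user_mb) * 1024 * 1024
                         : kDefaultWorkspaceBytes;
    }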
@@ -163,65 +143,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      half_float = true;
     } else {
       CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif

-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
-    if ((!exhaustive_search) && (!half_float)) {
-      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
-      VLOG(3) << "cuDNN forward algo " << algo;
-    } else if (exhaustive_search && (!half_float)) {
-      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
-      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
-        algo_cache =
-            ctx.scope()
-                .FindVar(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      } else {
-        algo_cache =
-            const_cast<framework::Scope&>(ctx.scope())
-                .Var(kCUDNNFwdAlgoCache)
-                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
-      }
-      algo = algo_cache->GetAlgorithm(
-          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-            int returned_algo_count;
-            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
-                fwd_perf_stat;
-            auto cudnn_find_func = [&](void* cudnn_workspace) {
-              CUDNN_ENFORCE(
-                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                      filter_data, cudnn_conv_desc, cudnn_output_desc,
-                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                      fwd_perf_stat.data(), cudnn_workspace,
-                      workspace_size_limit));
-            };
-            dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                              workspace_size_limit);
-
-            VLOG(3) << "Perf result: (algo: stat, time, memory)";
-            for (int i = 0; i < returned_algo_count; ++i) {
-              const auto& stat = fwd_perf_stat[i];
-              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                      << " " << stat.memory;
-            }
-            return fwd_perf_stat[0].algo;
-          });
-      VLOG(3) << "choose algo " << algo;
-    } else {
-      PADDLE_ENFORCE(half_float,
-                     "cuDNN exhaustive search doesn't support half float.");
-    }
-
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
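The deleted branch was the heart of the reverted feature: it benchmarked candidate algorithms with cudnnFindConvolutionForwardAlgorithmEx, took the top-ranked result, and memoized it in an AlgorithmsCache variable kept in the scope, keyed by the input/filter dims, strides, paddings, and dilations. A much-simplified sketch of that memoization pattern (hypothetical class, not the conv_cudnn_op_cache.h implementation; a real cache would also resolve hash collisions):

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <unordered_map>
    #include <vector>

    // Caches one algorithm per convolution configuration; on a miss, the
    // expensive search functor runs once and its result is stored.
    template <typename AlgoT>
    class SimpleAlgorithmsCache {
     public:
      AlgoT GetAlgorithm(const std::vector<int64_t>& key,
                         const std::function<AlgoT()>& search_func) {
        size_t seed = key.size();
        for (int64_t d : key) {  // naive hash-combine over the key values
          seed ^= std::hash<int64_t>()(d) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        }
        auto it = cache_.find(seed);
        if (it == cache_.end()) {
          it = cache_.emplace(seed, search_func()).first;  // e.g. run cudnnFind*Ex
        }
        return it->second;
      }

     private:
      std::unordered_map<size_t, AlgoT> cache_;
    };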
@@ -251,7 +178,6 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto input = ctx.Input<Tensor>("Input");

@@ -270,13 +196,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     int groups = ctx.Attr<int>("groups");
     int64_t user_workspace_size =
         static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
-    bool exhaustive_search =
-        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
-    if (exhaustive_search && FLAGS_cudnn_deterministic) {
-      PADDLE_THROW(
-          "Cann't set exhaustive_search True and "
-          "FLAGS_cudnn_deterministic True at same time.");
-    }

     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
@@ -344,65 +263,14 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionBwdFilterAlgo_t filter_algo;
     size_t workspace_size_in_bytes = 0, tmp_size = 0;
     size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
-      int64_t max_user_size =
-          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
-                   user_workspace_size);
-      workspace_size_limit = max_user_size * 1024 * 1024;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
     }

-    auto x_dims = framework::vectorize(input->dims());
-    auto f_dims = framework::vectorize(filter->dims());
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
-          data_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        } else {
-          data_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdDataAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
-        }
-        data_algo = data_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
-                         kNUM_CUDNN_BWD_DATA_ALGS>
-                  data_perf_stat;
-              auto cudnn_find_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardDataAlgorithmEx(
-                            handle, cudnn_filter_desc, filter_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
-                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                            data_perf_stat.data(), cudnn_workspace,
-                            workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_func,
-                                                workspace_size_limit);
-
-              VLOG(3) << "Perf result: (algo: stat, time, memory)";
-              for (int i = 0; i < returned_algo_count; ++i) {
-                const auto& stat = data_perf_stat[i];
-                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
-                        << " " << stat.memory;
-              }
-              return data_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward data algo " << data_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,

@@ -415,7 +283,10 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                 cudnn_input_desc,
                 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -424,54 +295,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }

     if (filter_grad) {
-      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
-      if (exhaustive_search) {
-        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
-        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
-          f_algo_cache =
-              ctx.scope()
-                  .FindVar(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        } else {
-          f_algo_cache =
-              const_cast<framework::Scope&>(ctx.scope())
-                  .Var(kCUDNNBwdFilterAlgoCache)
-                  ->GetMutable<
-                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
-        }
-        filter_algo = f_algo_cache->GetAlgorithm(
-            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
-              int returned_algo_count;
-              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
-                         kNUM_CUDNN_BWD_FILTER_ALGS>
-                  filter_perf_stat;
-              auto cudnn_find_f_func = [&](void* cudnn_workspace) {
-                CUDNN_ENFORCE(
-                    platform::dynload::
-                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
-                            handle, cudnn_input_desc, input_data,
-                            cudnn_output_grad_desc, output_grad_data,
-                            cudnn_conv_desc, cudnn_filter_desc,
-                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
-                            &returned_algo_count, filter_perf_stat.data(),
-                            cudnn_workspace, workspace_size_limit));
-              };
-              dev_ctx.RunCudnnFuncWithWorkspace(cudnn_find_f_func,
-                                                workspace_size_limit);
-              return filter_perf_stat[0].algo;
-            });
-        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
-      } else if (FLAGS_cudnn_deterministic) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      } else {
+      if (!FLAGS_cudnn_deterministic) {
         CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
                 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                 workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
+
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
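After the revert, both grad paths reduce to the same two-way choice as the forward kernel: ask cuDNN's heuristic for an algorithm that fits the workspace cap, or, when FLAGS_cudnn_deterministic is set, use the fixed *_ALGO_1 variant (cuDNN's documentation describes the backward ALGO_1 variants as deterministic, whereas the atomics-based ALGO_0 variants are not). Condensed, the restored control flow for the backward-filter pass is (descriptors assumed to be set up as in the kernel above):

    cudnnConvolutionBwdFilterAlgo_t filter_algo;
    if (!FLAGS_cudnn_deterministic) {
      // Heuristic pick, constrained to fit inside workspace_size_limit.
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
          handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
          cudnn_filter_desc, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          workspace_size_limit, &filter_algo));
    } else {
      // Fixed choice for run-to-run reproducibility.
      filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    }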
