
Commit 09dfc7a

Author: Wojciech Uss

C-API quantization core 2 (#16396)

* C-API quantization core (test=develop)
  Co-authored-by: Sylwester Fraczek <[email protected]>
* Decouple Quantizer from AnalysisPredictor (test=develop)
* fixes after review (test=develop)
* renamed mkldnn quantize stuff (test=develop)
* remove ifdef from header file (test=develop)

1 parent e41d581, commit 09dfc7a
13 files changed: +1089, -29 lines

paddle/fluid/inference/CMakeLists.txt

Lines changed: 10 additions & 4 deletions
@@ -37,18 +37,24 @@ endif(WIN32)
 
 add_subdirectory(api)
 
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+endif()
+
 set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
 set(SHARED_INFERENCE_SRCS
     io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
-             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
+             zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif(WIN32)
 
 if(NOT APPLE)
@@ -61,11 +67,11 @@ endif()
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
               DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-             analysis_config paddle_pass_builder)
+             analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif()
 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_fluid_shared ${os_dependency_modules})

paddle/fluid/inference/api/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
@@ -33,13 +33,19 @@ endif()
 
 add_subdirectory(details)
 
-cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+  cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
+endif()
+
+cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor
            reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
            lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-           analysis_config paddle_pass_builder zero_copy_tensor
+           paddle_pass_builder zero_copy_tensor
            reset_tensor_array)
 
 cc_test(test_paddle_inference_api

paddle/fluid/inference/api/analysis_config.cc

Lines changed: 44 additions & 8 deletions
@@ -108,6 +108,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
+  // Quantization related.
+  CP_MEMBER(use_mkldnn_quantizer_);
+  CP_MEMBER(mkldnn_quantizer_config_);
 
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
@@ -148,6 +151,26 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }
 
+void AnalysisConfig::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_config_)
+    mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig());
+  use_mkldnn_quantizer_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  use_mkldnn_quantizer_ = false;
+#endif
+
+  Update();
+}
+
+std::shared_ptr<MkldnnQuantizerConfig> AnalysisConfig::mkldnn_quantizer_config()
+    const {
+  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
+                          "MkldnnQuantizer was not enabled yet.");
+  return mkldnn_quantizer_config_;
+}
+
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
     AnalysisConfig::Precision precision_mode, bool use_static) {
@@ -224,15 +247,27 @@ void AnalysisConfig::Update() {
 #endif
   }
 
-  if (enable_memory_optim_) {
-    auto analysis_passes = pass_builder()->AnalysisPasses();
-    auto memory_opti_pass_name = "memory_optimize_pass";
-    bool already_exists =
-        std::find(analysis_passes.begin(), analysis_passes.end(),
-                  memory_opti_pass_name) != analysis_passes.end();
-    if (!already_exists) {
-      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
+  // Quantization passes must come after all other optimization passes
+  if (use_mkldnn_quantizer_) {
+    if (!enable_ir_optim_) {
+      LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization "
+                    "is enabled.";
     }
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnQuantizer();
+#else
+    LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+    use_mkldnn_quantizer_ = false;
+#endif
+  }
+
+#ifdef PADDLE_WITH_MKLDNN
+  // Do not optimize before quantization
+  if (enable_memory_optim_ && !use_mkldnn_quantizer_) {
+#else
+  if (enable_memory_optim_) {
+#endif
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
   }
 
   if (use_anakin_) {
@@ -277,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   for (auto &item : mkldnn_enabled_op_types_) ss << item;
   ss << ";";
 
+  ss << use_mkldnn_quantizer_;
   ss << model_from_memory_;
 
   ss << enable_ir_optim_;
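
For reference, a minimal usage sketch of the config API added above. "./mobilenet" is a hypothetical model directory; EnableMkldnnQuantizer() and mkldnn_quantizer_config() come from this diff, while the AnalysisConfig constructor, DisableGpu(), and EnableMKLDNN() are assumed from the existing API:

#include <memory>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// A minimal sketch: turn on INT8 quantization for CPU inference.
paddle::AnalysisConfig MakeQuantizedConfig() {
  paddle::AnalysisConfig config("./mobilenet");  // hypothetical model dir
  config.DisableGpu();             // the quantizer targets CPU execution
  config.EnableMKLDNN();           // MKL-DNN kernels must be enabled
  config.EnableMkldnnQuantizer();  // lazily creates a MkldnnQuantizerConfig

  // IR optimization must stay enabled, or Update() above logs an error.
  // Quantization settings live on the shared config object; calling this
  // getter before EnableMkldnnQuantizer() trips the PADDLE_ENFORCE_NOT_NULL
  // shown in the diff.
  std::shared_ptr<paddle::MkldnnQuantizerConfig> qcfg =
      config.mkldnn_quantizer_config();
  (void)qcfg;  // e.g. warm-up data would be supplied through qcfg
  return config;
}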

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 51 additions & 5 deletions
@@ -18,6 +18,7 @@
 #include <fstream>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -35,8 +36,13 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#endif
+
 #if PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -341,10 +347,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   return true;
 }
 
-// NOTE All the members in AnalysisConfig should be copied to Argument.
-void AnalysisPredictor::OptimizeInferenceProgram() {
-  status_program_optimized_ = true;
-
+void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
@@ -390,6 +393,16 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
   }
 
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.mkldnn_quantizer_enabled()) {
+    LOG(INFO) << "Quantization is enabled";
+    argument_.SetQuantizeEnabledOpTypes(
+        config_.mkldnn_quantizer_config()->enabled_op_types());
+    argument_.SetQuantizeExcludedOpIds(
+        config_.mkldnn_quantizer_config()->excluded_op_ids());
+  }
+#endif
+
   auto passes = config_.pass_builder()->AllPasses();
   if (!config_.ir_optim()) {
     passes.clear();
@@ -398,6 +411,13 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetIrAnalysisPasses(passes);
   argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
   argument_.SetScopeNotOwned(scope_.get());
+}
+
+// NOTE All the members in AnalysisConfig should be copied to Argument.
+void AnalysisPredictor::OptimizeInferenceProgram() {
+  status_program_optimized_ = true;
+
+  PrepareArgument();
   Analyzer().Run(&argument_);
 
   PADDLE_ENFORCE(argument_.scope_valid());
@@ -439,12 +459,31 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   }
 
   std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
+  auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
+
+  if (!predictor_p->Init(nullptr)) {
+    return nullptr;
+  }
+
+  if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) {
     return nullptr;
   }
+
   return predictor;
 }
 
+bool AnalysisPredictor::MkldnnQuantize() {
+#if PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_)
+    mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer(
+        *this, config_.mkldnn_quantizer_config());
+  return mkldnn_quantizer_->Quantize();
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  return false;
+#endif
+}
+
 void AnalysisPredictor::PrepareFeedFetch() {
   PADDLE_ENFORCE_NOT_NULL(sub_scope_);
   CreateFeedFetchVar(sub_scope_);
@@ -703,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() {
     scope_->DeleteScope(sub_scope_);
  }
 
+#if PADDLE_WITH_MKLDNN
+  if (mkldnn_quantizer_) {
+    delete mkldnn_quantizer_;
+    mkldnn_quantizer_ = nullptr;
+  }
+#endif
+
   // TODO(Superjomn) deduce the directory path.
   std::string out_path = inference::analysis::GetMemoryCachePath(
       config_.model_dir(), config_.prog_file());
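
A sketch of the new creation path from the caller's side, assuming the quantizer-enabled config from the previous sketch; only CreatePaddlePredictor<AnalysisConfig> and its nullptr-on-failure behavior are taken from this diff, and MakeQuantizedPredictor is an illustrative helper:

#include <iostream>
#include <memory>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// A minimal sketch: with the quantizer enabled, CreatePaddlePredictor runs
// MkldnnQuantize() right after Init() and returns nullptr if either step
// fails, so the caller only has to check the returned pointer.
std::unique_ptr<paddle::PaddlePredictor> MakeQuantizedPredictor(
    const paddle::AnalysisConfig &config) {
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
  if (!predictor) {
    // Either Init() failed, or mkldnn_quantizer_enabled() was true and
    // MkldnnQuantize() returned false.
    std::cerr << "predictor creation or quantization failed\n";
  }
  return predictor;
}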

paddle/fluid/inference/api/analysis_predictor.h

Lines changed: 13 additions & 0 deletions
@@ -70,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor {
   void CreateFeedFetchVar(framework::Scope *scope);
   void PrepareFeedFetch();
 
+  void PrepareArgument();
   void OptimizeInferenceProgram();
 
   Argument &analysis_argument() { return argument_; }
@@ -83,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor {
 
   std::string GetSerializedProgram() const override;
 
+  bool MkldnnQuantize();
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
@@ -143,6 +146,16 @@ class AnalysisPredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetches_;
   std::map<size_t, std::string> idx2fetches_;
 
+#if PADDLE_WITH_MKLDNN
+  // Helper class to perform quantization
+  class MkldnnQuantizer;
+  MkldnnQuantizer *mkldnn_quantizer_{nullptr};
+
+#if PADDLE_WITH_TESTING
+  friend class MkldnnQuantizerTest;
+#endif
+#endif
+
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
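
The header keeps the quantizer opaque: the nested MkldnnQuantizer class is only forward-declared, so mkldnn_quantizer.h never leaks into the public header. A generic sketch of this forward-declared nested-class pattern (all names here are illustrative, not Paddle's):

// Header part: only a forward declaration of the nested helper is exposed,
// so its definition (and any heavy includes it needs) stays in the .cc file.
class Predictor {
 public:
  ~Predictor();
  bool RunHelper();

 private:
  class Helper;             // nested class, forward-declared only
  Helper *helper_{nullptr}; // raw pointer owned by the enclosing class
};

// .cc part: the nested class is defined out of line.
class Predictor::Helper {
 public:
  bool Work() { return true; }
};

Predictor::~Predictor() { delete helper_; }  // Helper is complete here

bool Predictor::RunHelper() {
  if (!helper_) helper_ = new Predictor::Helper();  // lazy init, as in MkldnnQuantize()
  return helper_->Work();
}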
