
Commit 45fa686

Cherry-pick of lite engine, test=release/1.8 (#25817)
* ignore warnings of external libraries, test=develop (#24193)
* fix repeat definitions in liengine.cc, test=develop (#25020)
* remove paddle_use_kernel and paddle_use_op. test=develop (#25189)
* fix compile for lite subgraph. test=develop (#25285)
* [CI] [Lite-Subgraph] CI add lite subgraph check. (#25346)
* supports xpu runtime, test=develop (#25554)
* fix cmake of lite, test=develop (#25680)
* change commit files, test=release/1.8

Co-authored-by: Wilber <[email protected]>
1 parent 01fc84a commit 45fa686

19 files changed (+222, -38 lines)

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -154,6 +154,9 @@ if(WITH_BRPC_RDMA)
   endif()
 endif()
 
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) needs to be in front of include(third_party/lite)
+include(cudnn)              # set cudnn libraries, must before configure
 include(third_party)        # download, build, install third_party
 
 if(WITH_DISTRIBUTE)
@@ -173,7 +176,6 @@ if(NOT WIN32)
 endif()
 
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
 
 if(WITH_GPU)
   include(cuda)

cmake/external/lite.cmake

Lines changed: 19 additions & 3 deletions
@@ -18,14 +18,27 @@ if(NOT LINUX OR NOT WITH_MKL)
   return()
 endif()
 
+if(XPU_SDK_ROOT)
+  set(LITE_WITH_XPU ON)
+  include_directories("${XPU_SDK_ROOT}/XTDK/include")
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+  add_definitions(-DPADDLE_WITH_XPU)
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+endif()
+
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
   set(LITE_SOURCES_DIR ${THIRD_PARTY_PATH}/lite)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+  endif()
+
+  if(NOT CUDA_ARCH_NAME)
+    set(CUDA_ARCH_NAME "Auto")
   endif()
 
   # No quotes, so cmake can resolve it as a command with arguments.
@@ -43,6 +56,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
             -DCUDNN_ROOT=${CUDNN_ROOT}
             -DLITE_WITH_STATIC_CUDA=OFF
             -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+            -DLITE_WITH_XPU=${LITE_WITH_XPU}
+            -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
             -DLITE_WITH_ARM=OFF)
 
   ExternalProject_Add(
@@ -79,7 +94,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
 include_directories(${LITE_SOURCE_DIR})
 include_directories(${LITE_BINARY_DIR})
 
-function(external_lite_static_libs alias path)
+function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
                ${path})
@@ -88,7 +103,8 @@ function(external_lite_static_libs alias path)
   endif()
 endfunction()
 
-external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)

paddle/fluid/inference/analysis/argument.h

Lines changed: 4 additions & 0 deletions
@@ -200,6 +200,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
   DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
+
+  DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
+  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);

paddle/fluid/inference/analysis/ir_pass_manager.cc

Lines changed: 4 additions & 0 deletions
@@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("predictor_id", new int(argument->predictor_id()));
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
+      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
+      pass->Set("use_xpu", new bool(argument->use_xpu()));
+      pass->Set("xpu_l3_workspace_size",
+                new int(argument->xpu_l3_workspace_size()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {

paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc

Lines changed: 20 additions & 2 deletions
@@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine(
 
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+
+  lite_api::TargetType target_type;
+  if (use_gpu) {
+    target_type = TARGET(kCUDA);
+  } else if (use_xpu) {
+    target_type = TARGET(kXPU);
+  } else {
+    target_type = TARGET(kX86);
+  }
+
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
+
   serialize_params(&config.param, scope, repetitive_params);
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
+      // Notice: The ordering here determines the device where the
+      // input tensor of the Lite engine is located, and then affects
+      // whether tensor sharing is feasible.
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kInt64)}),
       paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
     lite::StrToBinaryFile("./param.bin", config.param);
@@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator(
   op_desc->SetAttr("engine_key", unique_key);
   op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
   op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
+  op_desc->SetAttr("zero_copy", Get<bool>("zero_copy"));
 }
 
 void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {

paddle/fluid/inference/api/analysis_config.cc

Lines changed: 30 additions & 1 deletion
@@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }
 
+void AnalysisConfig::EnableXpu(int l3_workspace_size) {
+  use_xpu_ = true;
+  xpu_l3_workspace_size_ = l3_workspace_size;
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
@@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_precision_mode_);
   CP_MEMBER(lite_passes_filter_);
   CP_MEMBER(lite_ops_filter_);
+  CP_MEMBER(lite_zero_copy_);
+
+  CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_l3_workspace_size_);
 
   // profile related.
   CP_MEMBER(with_profile_);
@@ -342,6 +352,22 @@ void AnalysisConfig::Update() {
     }
   }
 
+  if (use_xpu_) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
+    if (!use_lite_) {
+      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
+                      "subgraph mode, please make sure you have enabled it.";
+    }
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, XPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -385,6 +411,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << cpu_math_library_num_threads_;
 
   ss << use_lite_;
+  ss << use_xpu_;
+  ss << xpu_l3_workspace_size_;
 
   return ss.str();
 }
@@ -460,13 +488,14 @@ void AnalysisConfig::DisableGlogInfo() {
 }
 
 void AnalysisConfig::EnableLiteEngine(
-    AnalysisConfig::Precision precision_mode,
+    AnalysisConfig::Precision precision_mode, bool zero_copy,
     const std::vector<std::string> &passes_filter,
    const std::vector<std::string> &ops_filter) {
   use_lite_ = true;
   lite_precision_mode_ = precision_mode;
   lite_passes_filter_ = passes_filter;
   lite_ops_filter_ = ops_filter;
+  lite_zero_copy_ = zero_copy;
   Update();
 }

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 3 additions & 0 deletions
@@ -447,6 +447,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
+    argument_.SetLiteZeroCopy(config_.lite_zero_copy_);
+    argument_.SetUseXpu(config_.use_xpu_);
+    argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
     LOG(INFO) << "Lite subgraph engine is enabled";
   }

paddle/fluid/inference/api/paddle_analysis_config.h

Lines changed: 8 additions & 0 deletions
@@ -176,6 +176,8 @@ struct AnalysisConfig {
   ///
   ///
   void DisableGpu();
+
+  void EnableXpu(int l3_workspace_size = 0xfffc00);
   ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
@@ -319,6 +321,7 @@ struct AnalysisConfig {
   ///
   void EnableLiteEngine(
       AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
 
@@ -562,6 +565,11 @@ struct AnalysisConfig {
   std::vector<std::string> lite_passes_filter_;
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
+  bool lite_zero_copy_;
+
+  bool thread_local_stream_{false};
+  bool use_xpu_{false};
+  int xpu_l3_workspace_size_;
 
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
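
The header changes above expose the two new user-facing knobs of this commit: EnableXpu(l3_workspace_size) and the extra zero_copy parameter of EnableLiteEngine. The snippet below is a minimal usage sketch, not part of the commit: it assumes the umbrella header "paddle_inference_api.h", a hypothetical model directory "./mobilenet_v1", and the existing CreatePaddlePredictor factory; only the two configuration calls come from this patch.

#include <memory>

#include "paddle_inference_api.h"  // assumed umbrella header exposing AnalysisConfig

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory

  // New in this commit: run the Lite subgraph engine on an XPU device,
  // using the header's default 0xfffc00 L3 workspace size.
  config.EnableXpu(/*l3_workspace_size=*/0xfffc00);

  // zero_copy = true asks the Lite engine to share tensors with Paddle
  // instead of copying them across the subgraph boundary.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);

  auto predictor = paddle::CreatePaddlePredictor(config);
  // ... prepare inputs and run the predictor as usual ...
  return 0;
}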

paddle/fluid/inference/lite/CMakeLists.txt

Lines changed: 6 additions & 2 deletions

@@ -1,5 +1,9 @@
+if(XPU_SDK_ROOT)
+  set(XPU_DEPS xpuapi xpurt)
+endif()
+
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)

paddle/fluid/inference/lite/engine.cc

Lines changed: 13 additions & 7 deletions
@@ -16,12 +16,11 @@
 #define LITE_WITH_CUDA 1
 #endif
 
-#include "paddle/fluid/inference/lite/engine.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
+#ifdef PADDLE_WITH_XPU
+#define LITE_WITH_XPU 1
+#endif
 
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "lite/api/paddle_use_passes.h"
 
 namespace paddle {
@@ -43,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
-  auto* p = new paddle::lite::Predictor();
+  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
 #ifdef PADDLE_WITH_CUDA
-  paddle::lite::Env<TARGET(kCUDA)>::Init();
+    paddle::lite::Env<TARGET(kCUDA)>::Init();
 #endif
+  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+#ifdef PADDLE_WITH_XPU
+    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+        cfg.xpu_l3_workspace_size;
+#endif
+  }
+  auto* p = new paddle::lite::Predictor();
  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
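
For context, the EngineManager above is what the generated lite engine op ultimately talks to. The following hedged sketch shows how a caller might hand it an XPU-flavoured EngineConfig; the field and method names (model, param, valid_places, xpu_l3_workspace_size, Create) come from this diff, while the namespace layout, the header path, and the helper wrapper are illustrative assumptions.

#include <string>

#include "paddle/fluid/inference/lite/engine.h"  // EngineConfig, EngineManager (assumed path)

namespace paddle {
namespace inference {
namespace lite {

// Sketch only: build a predictor whose first valid place targets kXPU, so
// EngineManager::Create() takes the XPU branch added in this commit and
// forwards the per-thread L3 workspace size to the Lite runtime.
paddle::lite::Predictor* CreateXpuEngine(EngineManager* manager,
                                         const std::string& key,
                                         const std::string& model_proto,
                                         const std::string& serialized_params,
                                         int l3_workspace_size) {
  EngineConfig cfg;
  cfg.model = model_proto;        // serialized ProgramDesc
  cfg.param = serialized_params;  // serialized weights
  cfg.valid_places = {
      paddle::lite::Place({TARGET(kXPU), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  cfg.xpu_l3_workspace_size = l3_workspace_size;
  return manager->Create(key, cfg);
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle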
