
Commit 45fa686

Cherry-pick of lite engine, test=release/1.8 (#25817)
* ignore warnings of external libraries, test=develop (#24193)
* fix repeat definitions in liengine.cc, test=develop (#25020)
* remove paddle_use_kernel and paddle_use_op. test=develop (#25189)
* fix compile for lite subgraph. test=develop (#25285)
* [CI] [Lite-Subgraph] CI add lite subgraph check. (#25346)
* supports xpu runtime, test=develop (#25554)
* fix cmake of lite, test=develop (#25680)
* change commit files, test=release/1.8

Co-authored-by: Wilber <[email protected]>
1 parent 01fc84a commit 45fa686

19 files changed (+222, -38 lines)

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -154,6 +154,9 @@ if(WITH_BRPC_RDMA)
   endif()
 endif()
 
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) needs to be in front of include(third_party/lite)
+include(cudnn)              # set cudnn libraries, must before configure
 include(third_party)        # download, build, install third_party
 
 if(WITH_DISTRIBUTE)
@@ -173,7 +176,6 @@ if(NOT WIN32)
 endif()
 
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
 
 if(WITH_GPU)
   include(cuda)

cmake/external/lite.cmake

Lines changed: 19 additions & 3 deletions
@@ -18,14 +18,27 @@ if(NOT LINUX OR NOT WITH_MKL)
   return()
 endif()
 
+if(XPU_SDK_ROOT)
+  set(LITE_WITH_XPU ON)
+  include_directories("${XPU_SDK_ROOT}/XTDK/include")
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+  add_definitions(-DPADDLE_WITH_XPU)
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+endif()
+
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
   set(LITE_SOURCES_DIR ${THIRD_PARTY_PATH}/lite)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+  endif()
+
+  if(NOT CUDA_ARCH_NAME)
+    set(CUDA_ARCH_NAME "Auto")
   endif()
 
   # No quotes, so cmake can resolve it as a command with arguments.
@@ -43,6 +56,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
             -DCUDNN_ROOT=${CUDNN_ROOT}
             -DLITE_WITH_STATIC_CUDA=OFF
             -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+            -DLITE_WITH_XPU=${LITE_WITH_XPU}
+            -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
             -DLITE_WITH_ARM=OFF)
 
   ExternalProject_Add(
@@ -79,7 +94,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
 include_directories(${LITE_SOURCE_DIR})
 include_directories(${LITE_BINARY_DIR})
 
-function(external_lite_static_libs alias path)
+function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
                ${path})
@@ -88,7 +103,8 @@ function(external_lite_static_libs alias path)
   endif()
 endfunction()
 
-external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)

paddle/fluid/inference/analysis/argument.h

Lines changed: 4 additions & 0 deletions
@@ -200,6 +200,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
   DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
+
+  DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
+  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);

paddle/fluid/inference/analysis/ir_pass_manager.cc

Lines changed: 4 additions & 0 deletions
@@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("predictor_id", new int(argument->predictor_id()));
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
+      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
+      pass->Set("use_xpu", new bool(argument->use_xpu()));
+      pass->Set("xpu_l3_workspace_size",
+                new int(argument->xpu_l3_workspace_size()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {

paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc

Lines changed: 20 additions & 2 deletions
@@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine(
 
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+
+  lite_api::TargetType target_type;
+  if (use_gpu) {
+    target_type = TARGET(kCUDA);
+  } else if (use_xpu) {
+    target_type = TARGET(kXPU);
+  } else {
+    target_type = TARGET(kX86);
+  }
+
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
+
   serialize_params(&config.param, scope, repetitive_params);
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
+      // Notice: The ordering here determines the device where the
+      // input tensor of the Lite engine is located, and then affects
+      // whether tensor sharing is feasible.
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kInt64)}),
       paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
     lite::StrToBinaryFile("./param.bin", config.param);
@@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator(
   op_desc->SetAttr("engine_key", unique_key);
   op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
   op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
+  op_desc->SetAttr("zero_copy", Get<bool>("zero_copy"));
 }
 
 void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {

paddle/fluid/inference/api/analysis_config.cc

Lines changed: 30 additions & 1 deletion
@@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }
 
+void AnalysisConfig::EnableXpu(int l3_workspace_size) {
+  use_xpu_ = true;
+  xpu_l3_workspace_size_ = l3_workspace_size;
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
@@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_precision_mode_);
   CP_MEMBER(lite_passes_filter_);
   CP_MEMBER(lite_ops_filter_);
+  CP_MEMBER(lite_zero_copy_);
+
+  CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_l3_workspace_size_);
 
   // profile related.
   CP_MEMBER(with_profile_);
@@ -342,6 +352,22 @@ void AnalysisConfig::Update() {
     }
   }
 
+  if (use_xpu_) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
+    if (!use_lite_) {
+      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
+                      "subgraph mode, please make sure you have enabled it.";
+    }
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, XPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -385,6 +411,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << cpu_math_library_num_threads_;
 
   ss << use_lite_;
+  ss << use_xpu_;
+  ss << xpu_l3_workspace_size_;
 
   return ss.str();
 }
@@ -460,13 +488,14 @@ void AnalysisConfig::DisableGlogInfo() {
 }
 
 void AnalysisConfig::EnableLiteEngine(
-    AnalysisConfig::Precision precision_mode,
+    AnalysisConfig::Precision precision_mode, bool zero_copy,
     const std::vector<std::string> &passes_filter,
    const std::vector<std::string> &ops_filter) {
   use_lite_ = true;
   lite_precision_mode_ = precision_mode;
   lite_passes_filter_ = passes_filter;
   lite_ops_filter_ = ops_filter;
+  lite_zero_copy_ = zero_copy;
   Update();
 }

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 3 additions & 0 deletions
@@ -447,6 +447,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
+    argument_.SetLiteZeroCopy(config_.lite_zero_copy_);
+    argument_.SetUseXpu(config_.use_xpu_);
+    argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
     LOG(INFO) << "Lite subgraph engine is enabled";
   }

paddle/fluid/inference/api/paddle_analysis_config.h

Lines changed: 8 additions & 0 deletions
@@ -176,6 +176,8 @@ struct AnalysisConfig {
   ///
   ///
   void DisableGpu();
+
+  void EnableXpu(int l3_workspace_size = 0xfffc00);
   ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
@@ -319,6 +321,7 @@ struct AnalysisConfig {
   ///
   void EnableLiteEngine(
       AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
 
@@ -562,6 +565,11 @@ struct AnalysisConfig {
   std::vector<std::string> lite_passes_filter_;
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
+  bool lite_zero_copy_;
+
+  bool thread_local_stream_{false};
+  bool use_xpu_{false};
+  int xpu_l3_workspace_size_;
 
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
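
The header changes above expose the two new user-facing knobs of this commit: EnableXpu(l3_workspace_size) and the extra zero_copy parameter of EnableLiteEngine. The snippet below is a minimal usage sketch, not part of the commit: it assumes the umbrella header "paddle_inference_api.h", a hypothetical model directory "./mobilenet_v1", and the existing CreatePaddlePredictor factory; only the two configuration calls come from this patch.

#include <memory>

#include "paddle_inference_api.h"  // assumed umbrella header exposing AnalysisConfig

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory

  // New in this commit: run the Lite subgraph engine on an XPU device,
  // using the header's default 0xfffc00 L3 workspace size.
  config.EnableXpu(/*l3_workspace_size=*/0xfffc00);

  // zero_copy = true asks the Lite engine to share tensors with Paddle
  // instead of copying them across the subgraph boundary.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);

  auto predictor = paddle::CreatePaddlePredictor(config);
  // ... prepare inputs and run the predictor as usual ...
  return 0;
}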

paddle/fluid/inference/lite/CMakeLists.txt

Lines changed: 6 additions & 2 deletions

@@ -1,5 +1,9 @@
+if(XPU_SDK_ROOT)
+  set(XPU_DEPS xpuapi xpurt)
+endif()
+
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)

paddle/fluid/inference/lite/engine.cc

Lines changed: 13 additions & 7 deletions
@@ -16,12 +16,11 @@
 #define LITE_WITH_CUDA 1
 #endif
 
-#include "paddle/fluid/inference/lite/engine.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
+#ifdef PADDLE_WITH_XPU
+#define LITE_WITH_XPU 1
+#endif
 
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "lite/api/paddle_use_passes.h"
 
 namespace paddle {
@@ -43,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
-  auto* p = new paddle::lite::Predictor();
+  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
 #ifdef PADDLE_WITH_CUDA
-  paddle::lite::Env<TARGET(kCUDA)>::Init();
+    paddle::lite::Env<TARGET(kCUDA)>::Init();
 #endif
+  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+#ifdef PADDLE_WITH_XPU
+    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+        cfg.xpu_l3_workspace_size;
+#endif
+  }
+  auto* p = new paddle::lite::Predictor();
  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
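
For context, the EngineManager above is what the generated lite engine op ultimately talks to. The following hedged sketch shows how a caller might hand it an XPU-flavoured EngineConfig; the field and method names (model, param, valid_places, xpu_l3_workspace_size, Create) come from this diff, while the namespace layout, the header path, and the helper wrapper are illustrative assumptions.

#include <string>

#include "paddle/fluid/inference/lite/engine.h"  // EngineConfig, EngineManager (assumed path)

namespace paddle {
namespace inference {
namespace lite {

// Sketch only: build a predictor whose first valid place targets kXPU, so
// EngineManager::Create() takes the XPU branch added in this commit and
// forwards the per-thread L3 workspace size to the Lite runtime.
paddle::lite::Predictor* CreateXpuEngine(EngineManager* manager,
                                         const std::string& key,
                                         const std::string& model_proto,
                                         const std::string& serialized_params,
                                         int l3_workspace_size) {
  EngineConfig cfg;
  cfg.model = model_proto;        // serialized ProgramDesc
  cfg.param = serialized_params;  // serialized weights
  cfg.valid_places = {
      paddle::lite::Place({TARGET(kXPU), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  cfg.xpu_l3_workspace_size = l3_workspace_size;
  return manager->Create(key, cfg);
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle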
