Skip to content

Commit 7796f65

Browse files
authored
fix inference on gpu out of mem (#14414)
* Fix inference on GPU out-of-memory: the transfer logic in operator.cc will keep creating new scopes.
1 parent 64f7516 commit 7796f65

File tree

6 files changed

+50
-2
lines changed

6 files changed

+50
-2
lines changed

CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,6 @@ endif()
315315

316316
if (ON_INFER)
317317
message(STATUS "On inference mode, will take place some specific optimization.")
318-
add_definitions(-DPADDLE_ON_INFERENCE)
319318
else()
320319
#TODO(luotao), combine this warning with `make inference_lib_dist` command.
321320
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")

cmake/configure.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,3 +218,7 @@ endif(WITH_GRPC)
218218
if(WITH_BRPC_RDMA)
219219
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
220220
endif(WITH_BRPC_RDMA)
221+
222+
if(ON_INFER)
223+
add_definitions(-DPADDLE_ON_INFERENCE)
224+
endif(ON_INFER)

paddle/fluid/framework/naive_executor.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
7070
}
7171

7272
void NaiveExecutor::Run() {
73+
#ifndef PADDLE_ON_INFERENCE
74+
LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the "
75+
"cmake flag ON_INFER is not set.";
76+
LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and "
77+
"variables will be reused to save the allocation "
78+
"overhead.";
79+
LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by "
80+
"setting the cmake flag ON_INFER=ON if you are "
81+
"running Paddle Inference";
82+
#endif // PADDLE_ON_INFERENCE
7383
for (auto &op : ops_) {
7484
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
7585
<< " on scope " << scope_;

paddle/fluid/framework/op_kernel_type.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ struct OpKernelType {
6363
place_(dev_ctx.GetPlace()),
6464
library_type_(library_type) {}
6565

66+
size_t hash_key() const { return Hash()(*this); }
67+
6668
bool operator==(const OpKernelType& o) const {
6769
return platform::places_are_same_class(place_, o.place_) &&
6870
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&

paddle/fluid/framework/operator.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ DEFINE_bool(check_nan_inf, false,
3535
namespace paddle {
3636
namespace framework {
3737

38+
// Combine two hash values to a single hash.
39+
inline size_t CombineHash(size_t seed, size_t a) {
40+
return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
41+
}
42+
3843
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
3944
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
4045
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
@@ -794,6 +799,17 @@ void OperatorWithKernel::TransferInplaceVarsBack(
794799
Scope* OperatorWithKernel::TryTransferData(
795800
const Scope& scope, const OpKernelType& expected_kernel_key,
796801
std::vector<std::string>* transfered_inplace_vars) const {
802+
// In the inference scenario, the scopes will be reused across the batches, so
803+
// the `new_scope` here will result in GPU memory explosion over the running of
804+
// operators.
805+
// We use a thread_local cache to fix that issue; the key in the cache is the
806+
// combination of the `scope` argument, from_kernel_type, target_kernel_type.
807+
// Have a discussion with @Superjomn or the inference developers if some changes
808+
// on this logic for this macro might not be tested on the other scenarios.
809+
#ifdef PADDLE_ON_INFERENCE
810+
thread_local std::unordered_map<size_t, Scope*> infer_transfer_scope_cache;
811+
#endif
812+
797813
Scope* new_scope = nullptr;
798814
for (auto& var_name_item : Inputs()) {
799815
for (auto& var_name : var_name_item.second) {
@@ -824,11 +840,28 @@ Scope* OperatorWithKernel::TryTransferData(
824840
VLOG(30) << "Transform Variable " << var_name << " from "
825841
<< kernel_type_for_var << " to " << expected_kernel_key;
826842

843+
#ifdef PADDLE_ON_INFERENCE
844+
size_t infer_cache_key =
845+
CombineHash(OpKernelType::Hash()(kernel_type_for_var),
846+
OpKernelType::Hash()(expected_kernel_key));
847+
infer_cache_key =
848+
CombineHash(infer_cache_key, std::hash<const Scope*>()(&scope));
849+
850+
auto it = infer_transfer_scope_cache.find(infer_cache_key);
851+
if (it != infer_transfer_scope_cache.end()) {
852+
new_scope = infer_transfer_scope_cache[infer_cache_key];
853+
} else {
854+
new_scope = &scope.NewScope();
855+
infer_transfer_scope_cache[infer_cache_key] = new_scope;
856+
}
857+
#endif
858+
827859
if (new_scope == nullptr) {
828860
new_scope = &scope.NewScope();
829861
}
830862

831863
auto* trans_var = new_scope->Var(var_name);
864+
832865
Tensor out;
833866
TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
834867
SetTensorToVariable(*var, out, trans_var);

paddle/fluid/framework/scope.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ DEFINE_double(
4242
// a mean time, but a scope may be read by multiple threads concurrently, and
4343
// the mutex will cause serious performance issue.
4444
// So the mutex is disabled when `ON_INFER`.
45-
#ifdef ON_INFER
45+
#ifdef PADDLE_ON_INFERENCE
4646
#define SCOPE_LOCK_GUARD
4747
#else
4848
#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);

0 commit comments

Comments
 (0)