Skip to content

Commit 234a1d9

Browse files
committed
Merge remote-tracking branch 'origin/develop' into windows/debug
test=develop
2 parents 2835e04 + a270fdf commit 234a1d9

File tree

79 files changed

+1279
-1419
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

79 files changed

+1279
-1419
lines changed

paddle/fluid/framework/details/execution_strategy.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515
#pragma once
16+
#include <cstddef> // for size_t
1617

1718
namespace paddle {
1819
namespace framework {
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
2627
bool allow_op_delay_{false};
2728
size_t num_iteration_per_drop_scope_{100};
2829
ExecutorType type_{kDefault};
30+
bool dry_run_{false};
2931
};
3032

3133
} // namespace details

paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
128128
size_t complete = 0;
129129
while (op_to_run != nullptr) {
130130
try {
131-
op_to_run->Run(strategy_.use_cuda_);
131+
if (LIKELY(!strategy_.dry_run_)) {
132+
op_to_run->Run(strategy_.use_cuda_);
133+
}
132134
++complete;
133135
} catch (...) {
134136
exception_.Catch(std::current_exception());

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
211211
if (VLOG_IS_ON(10)) {
212212
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
213213
}
214-
op->Run(strategy_.use_cuda_);
214+
if (LIKELY(!strategy_.dry_run_)) {
215+
op->Run(strategy_.use_cuda_);
216+
}
215217
VLOG(10) << op << " " << op->Name() << " Done ";
216218
running_ops_--;
217219
ready_var_q->Extend(op->Outputs());

paddle/fluid/framework/details/threaded_ssa_graph_executor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
4848
// Use topological sort algorithm
4949
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
5050

51-
~ThreadedSSAGraphExecutor() {}
51+
~ThreadedSSAGraphExecutor() final = default;
5252

5353
private:
5454
void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,

paddle/fluid/framework/parallel_executor.cc

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
3838
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
3939
: places_(places) {}
4040

41+
~ParallelExecutorPrivate() {
42+
if (own_local_scope_) {
43+
for (size_t i = 1; i < local_scopes_.size(); ++i) {
44+
// Skip the first scope, since it is the global scope.
45+
Scope *local_scope = local_scopes_[i];
46+
if (global_scope_->HasKid(local_scope)) {
47+
global_scope_->DeleteScope(local_scope);
48+
}
49+
}
50+
}
51+
}
4152
std::vector<platform::Place> places_;
4253
std::vector<Scope *> local_scopes_;
43-
Scope *global_scope_;
54+
Scope *global_scope_; // not owned
4455
std::unique_ptr<details::SSAGraphExecutor> executor_;
4556

4657
#ifdef PADDLE_WITH_CUDA
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
306317
for (auto &p : member_->places_) {
307318
platform::DeviceContextPool::Instance().Get(p)->Wait();
308319
}
309-
310-
if (member_->own_local_scope_) {
311-
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
312-
Scope *local_scope = member_->local_scopes_[i];
313-
if (member_->global_scope_->HasKid(local_scope)) {
314-
member_->global_scope_->DeleteScope(local_scope);
315-
}
316-
}
317-
}
318-
319320
// member_ must be destructed before gcs_ since the destructor of
320321
// ReferenceCountOpHandle use raw pointers of gcs_ inside.
321322
member_.reset();

paddle/fluid/framework/threadpool.cc

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,10 @@ ThreadPool::ThreadPool(int num_threads) : running_(true) {
5757
ThreadPool::~ThreadPool() {
5858
{
5959
// notify all threads to stop running
60-
std::lock_guard<std::mutex> l(mutex_);
60+
std::unique_lock<std::mutex> l(mutex_);
6161
running_ = false;
62-
scheduled_.notify_all();
6362
}
63+
scheduled_.notify_all();
6464

6565
for (auto& t : threads_) {
6666
t->join();
@@ -70,19 +70,25 @@ ThreadPool::~ThreadPool() {
7070

7171
void ThreadPool::TaskLoop() {
7272
while (true) {
73-
std::unique_lock<std::mutex> lock(mutex_);
73+
Task task;
7474

75-
scheduled_.wait(
76-
lock, [this] { return !this->tasks_.empty() || !this->running_; });
75+
{
76+
std::unique_lock<std::mutex> lock(mutex_);
77+
scheduled_.wait(
78+
lock, [this] { return !this->tasks_.empty() || !this->running_; });
7779

78-
if (!running_ || tasks_.empty()) {
79-
return;
80-
}
80+
if (!running_ && tasks_.empty()) {
81+
return;
82+
}
83+
84+
if (tasks_.empty()) {
85+
PADDLE_THROW("This thread has no task to Run");
86+
}
8187

82-
// pop a task from the task queue
83-
auto task = std::move(tasks_.front());
84-
tasks_.pop();
85-
lock.unlock();
88+
// pop a task from the task queue
89+
task = std::move(tasks_.front());
90+
tasks_.pop();
91+
}
8692

8793
// run the task
8894
task();

paddle/fluid/framework/threadpool.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class ThreadPool {
5858
~ThreadPool();
5959

6060
// Run pushes a function to the task queue and returns a std::future
61-
// object. To wait for the completion of the task, call
61+
// object. To wait for the completion of the task, call
6262
// std::future::wait().
6363
template <typename Callback>
6464
std::future<void> Run(Callback fn) {
@@ -69,7 +69,6 @@ class ThreadPool {
6969
template <typename Callback>
7070
std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
7171
Callback fn) {
72-
std::unique_lock<std::mutex> lock(mutex_);
7372
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
7473
try {
7574
fn();
@@ -84,7 +83,13 @@ class ThreadPool {
8483
return nullptr;
8584
});
8685
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
87-
tasks_.push(std::move(task));
86+
{
87+
std::unique_lock<std::mutex> lock(mutex_);
88+
if (!running_) {
89+
PADDLE_THROW("enqueue on stopped ThreadPool");
90+
}
91+
tasks_.push(std::move(task));
92+
}
8893
scheduled_.notify_one();
8994
return f;
9095
}

paddle/fluid/inference/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
if(WITH_TESTING)
2-
include(test.cmake) # some generic cmake function for inference
2+
include(tests/test.cmake) # some generic cmake function for inference
33
endif()
44
# analysis and tensorrt must be added before creating static library,
55
# otherwise, there would be undefined reference to them in static library.

paddle/fluid/inference/tensorrt/convert/conv2d_op.cc

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,21 @@ namespace paddle {
1818
namespace inference {
1919
namespace tensorrt {
2020

21+
bool to_skip_merging_optimize(TensorRTEngine* engine_,
22+
const std::vector<int>& filters,
23+
const std::vector<int>& strides,
24+
const std::vector<int>& paddings,
25+
std::string input_name) {
26+
if (engine_->itensor_quote_num[input_name] > 0) {
27+
return true;
28+
}
29+
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
30+
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
31+
engine_->itensor_quote_num[input_name] += 1;
32+
33+
return false;
34+
}
35+
2136
class Conv2dOpConverter : public OpConverter {
2237
public:
2338
void operator()(const framework::proto::OpDesc& op,
@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
3146
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
3247

3348
auto* X = engine_->GetITensor(op_desc.Input("Input").front());
49+
3450
// Declare weights
3551
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
3652
PADDLE_ENFORCE_NOT_NULL(Y_v);
@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
8399
std::move(weight_tensor);
84100
layer->getOutput(0)->setName(output_name.c_str());
85101
engine_->SetITensor(output_name, layer->getOutput(0));
86-
if (test_mode) {
102+
103+
if (test_mode ||
104+
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
105+
paddings, op_desc.Input("Input").front())) {
87106
engine_->DeclareOutput(output_name);
88107
}
89108
}

paddle/fluid/inference/tensorrt/engine.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
133133
buffer_sizes_[name] = 0;
134134
}
135135

136+
bool TensorRTEngine::HasDeclared(const std::string &name) {
137+
return buffer_sizes_.count(name) > 0;
138+
}
139+
136140
void TensorRTEngine::DeclareOutput(const std::string &name) {
137141
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
138142
name);

0 commit comments

Comments
 (0)