Skip to content

Commit e5281b3

Browse files
committed
Clean code & add execution strategy
1 parent 9923be5 commit e5281b3

File tree

9 files changed

+154
-104
lines changed

9 files changed

+154
-104
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once

namespace paddle {
namespace framework {
namespace details {

// Tunable knobs controlling how a ParallelExecutor runs an SSA graph.
// Plain aggregate: all members are public and carry in-class defaults so a
// default-constructed strategy is immediately usable.
struct ExecutionStrategy {
  // Worker threads for op execution; 0 means "run ops inline / no pool"
  // (the executor only builds a ThreadPool when this is >= 2).
  size_t num_threads_{0};
  // Whether op handles run with CUDA-event-based dependency tracking.
  bool use_event_{true};
  // When true, multi-device transfer ops may be delayed and scheduled
  // after all currently-ready ops, to overlap communication with compute.
  bool allow_op_delay_{false};
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,17 @@ namespace paddle {
1818
namespace framework {
1919
namespace details {
2020
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
21-
size_t num_threads, bool use_event,
22-
const std::vector<Scope *> &local_scopes,
21+
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
2322
const std::vector<platform::Place> &places,
24-
std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
23+
std::unique_ptr<SSAGraph> &&graph)
2524
: SSAGraphExecutor(std::move(graph)),
26-
pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
25+
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
26+
: nullptr),
2727
local_scopes_(local_scopes),
2828
places_(places),
2929
fetch_ctxs_(places),
30-
use_event_(use_event),
3130
running_ops_(0),
32-
allow_op_delay_(allow_op_delay) {}
31+
strategy_(strategy) {}
3332

3433
FeedFetchList ThreadedSSAGraphExecutor::Run(
3534
const std::vector<std::string> &fetch_tensors) {
@@ -86,7 +85,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
8685
//
8786
// NOTE: DelayedOps have a lower priority. It will be scheduled after all
8887
// ready_ops have been performed.
89-
if (ready_ops.empty() && allow_op_delay_ && running_ops_ == 0) {
88+
if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
9089
run_all_ops(delayed_ops);
9190
} else {
9291
run_all_ops(ready_ops);
@@ -113,7 +112,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
113112
auto &deps = pending_ops[op];
114113
--deps;
115114
if (deps == 0) {
116-
if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
115+
if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
117116
delayed_ops.insert(op);
118117
} else {
119118
ready_ops.insert(op);
@@ -191,7 +190,7 @@ void ThreadedSSAGraphExecutor::RunOp(
191190
auto op_run = [ready_var_q, op, this] {
192191
try {
193192
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
194-
op->Run(use_event_);
193+
op->Run(strategy_.use_event_);
195194
VLOG(10) << op << " " << op->Name() << " Done ";
196195
running_ops_--;
197196
ready_var_q->Extend(op->Outputs());

paddle/fluid/framework/details/threaded_ssa_graph_executor.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <functional>
2424
#include "ThreadPool.h" // ThreadPool in thrird party
2525
#include "paddle/fluid/framework/blocking_queue.h"
26+
#include "paddle/fluid/framework/details/execution_strategy.h"
2627
#include "paddle/fluid/framework/details/fetch_op_handle.h"
2728
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
2829

@@ -34,11 +35,10 @@ namespace details {
3435

3536
class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
3637
public:
37-
ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
38+
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
3839
const std::vector<Scope *> &local_scopes,
3940
const std::vector<platform::Place> &places,
40-
std::unique_ptr<SSAGraph> &&graph,
41-
bool allow_op_delay);
41+
std::unique_ptr<SSAGraph> &&graph);
4242

4343
// Run a SSAGraph by a thread pool
4444
// Use topological sort algorithm
@@ -55,10 +55,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
5555
std::vector<Scope *> local_scopes_;
5656
std::vector<platform::Place> places_;
5757
platform::DeviceContextPool fetch_ctxs_;
58-
const bool use_event_;
5958
std::unique_ptr<platform::EnforceNotMet> exception_;
6059
std::atomic<int> running_ops_;
61-
bool allow_op_delay_;
6260

6361
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
6462
OpHandleBase *op_instance) const;
@@ -74,6 +72,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
7472
std::unordered_map<OpHandleBase *, size_t> *pending_ops,
7573
std::unordered_set<VarHandleBase *> *pending_vars,
7674
BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
75+
76+
private:
77+
ExecutionStrategy strategy_;
7778
};
7879

7980
} // namespace details

paddle/fluid/framework/parallel_executor.cc

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
5252
}
5353

5454
ParallelExecutor::ParallelExecutor(
55-
size_t num_threads, bool use_event,
5655
const std::vector<platform::Place> &places,
5756
const std::unordered_set<std::string> &params,
5857
const std::unordered_set<std::string> &bcast_vars,
5958
const ProgramDesc &main_program, const std::string &loss_var_name,
60-
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
61-
bool use_default_grad_scale, bool balance_parameter_opt_between_cards)
59+
Scope *scope, const std::vector<Scope *> &local_scopes,
60+
bool use_default_grad_scale, bool balance_parameter_opt_between_cards,
61+
const ExecutionStrategy &exec_strategy)
6262
: member_(new ParallelExecutorPrivate(places)) {
6363
member_->global_scope_ = scope;
6464

@@ -103,8 +103,7 @@ ParallelExecutor::ParallelExecutor(
103103
auto graph = builder.Build(main_program);
104104

105105
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
106-
num_threads, use_event, member_->local_scopes_, places, std::move(graph),
107-
allow_op_delay));
106+
exec_strategy, member_->local_scopes_, places, std::move(graph)));
108107

109108
// Step 3. Create vars in each scope;
110109
for (auto *var : main_program.Block(0).AllVars()) {

paddle/fluid/framework/parallel_executor.h

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,53 +17,55 @@ limitations under the License. */
1717
#include <string>
1818
#include <unordered_set>
1919
#include <vector>
20+
#include "paddle/fluid/framework/details/execution_strategy.h"
2021
#include "paddle/fluid/framework/executor.h"
2122
#include "paddle/fluid/framework/op_info.h"
2223
#include "paddle/fluid/framework/program_desc.h"
2324
#include "paddle/fluid/framework/scope.h"
2425
#include "paddle/fluid/framework/tensor.h"
2526
#include "paddle/fluid/platform/device_context.h"
26-
2727
namespace paddle {
2828
namespace framework {
2929

3030
class ParallelExecutorPrivate;
3131

32+
using details::ExecutionStrategy;
33+
3234
class ParallelExecutor {
3335
DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
3436

3537
public:
36-
explicit ParallelExecutor(size_t num_threads, bool use_event,
37-
const std::vector<platform::Place>& places,
38-
const std::unordered_set<std::string>& params,
39-
const std::unordered_set<std::string>& bcast_vars,
40-
const ProgramDesc& main_program,
41-
const std::string& loss_var_name, Scope* scope,
42-
const std::vector<Scope*>& local_scopes,
43-
bool allow_op_delay, bool use_default_grad_scale,
44-
bool balance_parameter_opt_between_cards);
38+
explicit ParallelExecutor(const std::vector<platform::Place> &places,
39+
const std::unordered_set<std::string> &params,
40+
const std::unordered_set<std::string> &bcast_vars,
41+
const ProgramDesc &main_program,
42+
const std::string &loss_var_name, Scope *scope,
43+
const std::vector<Scope *> &local_scopes,
44+
bool use_default_grad_scale,
45+
bool balance_parameter_opt_between_cards,
46+
const ExecutionStrategy &exec_strategy);
4547

4648
~ParallelExecutor();
4749

48-
std::vector<Scope*>& GetLocalScopes();
50+
std::vector<Scope *> &GetLocalScopes();
4951

5052
/**
5153
* Feed tensors to local scopes. The size of tensors should be equal to the
5254
* size of local scopes.
5355
*/
5456
void FeedTensorsIntoLocalScopes(
55-
const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
57+
const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
5658

5759
void FeedAndSplitTensorIntoLocalScopes(
58-
const std::unordered_map<std::string, LoDTensor>& tensors);
60+
const std::unordered_map<std::string, LoDTensor> &tensors);
5961

60-
void Run(const std::vector<std::string>& fetch_tensors,
61-
const std::string& fetched_var_name);
62+
void Run(const std::vector<std::string> &fetch_tensors,
63+
const std::string &fetched_var_name);
6264

63-
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
65+
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
6466

6567
private:
66-
ParallelExecutorPrivate* member_;
68+
ParallelExecutorPrivate *member_;
6769
};
6870

6971
} // namespace framework

paddle/fluid/pybind/pybind.cc

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -494,22 +494,33 @@ All parameter, weight, gradient are variables in Paddle.
494494
m.def("disable_profiler", platform::DisableProfiler);
495495
m.def("reset_profiler", platform::ResetProfiler);
496496

497-
py::class_<ParallelExecutor>(m, "ParallelExecutor")
498-
.def("__init__",
499-
[](ParallelExecutor &self, size_t num_threads, bool use_event,
500-
const std::vector<platform::Place> &places,
501-
const std::unordered_set<std::string> &params,
502-
const std::unordered_set<std::string> &bcast_vars,
503-
const ProgramDesc &main_program, const std::string &loss_var_name,
504-
Scope *scope, std::vector<Scope *> &local_scopes,
505-
bool allow_op_delay, bool use_default_grad_scale,
506-
bool balance_parameter_opt_between_cards) {
507-
new (&self) ParallelExecutor(
508-
num_threads, use_event, places, params, bcast_vars,
509-
main_program, loss_var_name, scope, local_scopes,
510-
allow_op_delay, use_default_grad_scale,
511-
balance_parameter_opt_between_cards);
512-
})
497+
py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
498+
py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
499+
.def(py::init())
500+
.def_property(
501+
"num_threads",
502+
[](const ExecutionStrategy &self) { return self.num_threads_; },
503+
[](ExecutionStrategy &self, size_t num_threads) {
504+
self.num_threads_ = num_threads;
505+
})
506+
.def_property(
507+
"use_event",
508+
[](const ExecutionStrategy &self) { return self.use_event_; },
509+
[](ExecutionStrategy &self, bool use_event) {
510+
self.use_event_ = use_event;
511+
})
512+
.def_property(
513+
"allow_op_delay",
514+
[](const ExecutionStrategy &self) { return self.allow_op_delay_; },
515+
[](ExecutionStrategy &self, bool allow_op_delay) {
516+
self.allow_op_delay_ = allow_op_delay;
517+
});
518+
519+
pe.def(py::init<const std::vector<platform::Place> &,
520+
const std::unordered_set<std::string> &,
521+
const std::unordered_set<std::string> &, const ProgramDesc &,
522+
const std::string &, Scope *, std::vector<Scope *> &, bool,
523+
bool, const ExecutionStrategy &>())
513524
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
514525
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
515526
// We still cannot get local_scope from this vector, since the element

python/paddle/fluid/__init__.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -44,42 +44,44 @@
4444
from param_attr import ParamAttr, WeightNormParamAttr
4545
from data_feeder import DataFeeder
4646
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
47-
from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
47+
from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \
48+
InferenceTranspiler, memory_optimize, release_memory
4849
from concurrency import (Go, make_channel, channel_send, channel_recv,
4950
channel_close, Select)
5051
import clip
5152
import profiler
5253
import unique_name
5354
import recordio_writer
54-
from parallel_executor import ParallelExecutor
55+
from parallel_executor import ParallelExecutor, ExecutionStrategy
5556

5657
Tensor = LoDTensor
5758

58-
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
59+
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
5960
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
60-
'io',
61-
'initializer',
62-
'layers',
63-
'transpiler'
64-
'nets',
65-
'optimizer',
66-
'learning_rate_decay',
67-
'backward',
68-
'regularizer',
69-
'LoDTensor',
70-
'CPUPlace',
71-
'CUDAPlace',
72-
'CUDAPinnedPlace',
73-
'Tensor',
74-
'ParamAttr',
75-
'WeightNormParamAttr',
76-
'DataFeeder',
77-
'clip',
78-
'profiler',
79-
'unique_name',
80-
'recordio_writer',
81-
'ParallelExecutor',
82-
]
61+
'io',
62+
'initializer',
63+
'layers',
64+
'transpiler'
65+
'nets',
66+
'optimizer',
67+
'learning_rate_decay',
68+
'backward',
69+
'regularizer',
70+
'LoDTensor',
71+
'CPUPlace',
72+
'CUDAPlace',
73+
'CUDAPinnedPlace',
74+
'Tensor',
75+
'ParamAttr',
76+
'WeightNormParamAttr',
77+
'DataFeeder',
78+
'clip',
79+
'profiler',
80+
'unique_name',
81+
'recordio_writer',
82+
'ParallelExecutor',
83+
'ExecutionStrategy',
84+
]
8385

8486

8587
def __bootstrap__():

0 commit comments

Comments
 (0)