Commit ce72c3f

Author: chengduo

Merge pull request #10476 from chengduoZH/refine_parallel_exe

Clean Parallel exe

2 parents 61343fb + a89cd46, commit ce72c3f

17 files changed: +191 -142 lines

paddle/fluid/framework/details/broadcast_op_handle.cc

Lines changed: 32 additions & 28 deletions

@@ -38,9 +38,7 @@ void BroadcastOpHandle::RunImpl() {
       out_var_handles.size(), places_.size(),
       "The number of output should equal to the number of places.");
 
-  // Wait input done, this Wait is asynchronous operation platform::Place
-  // &in_place;
-  WaitInputVarGenerated(*in_var_handle);
+  WaitInputVarGenerated();
 
   std::vector<const Scope *> var_scopes;
   for (auto *s : local_scopes_) {
@@ -50,29 +48,9 @@ void BroadcastOpHandle::RunImpl() {
   auto *in_var =
       var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
   PADDLE_ENFORCE_NOT_NULL(in_var);
-
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
 
-  // NOTE: The tensors' Place of input and output must be all on GPU or all on
-  // CPU.
-  for (auto *out_var_handle : out_var_handles) {
-    if (out_var_handle->IsTheSameVar(*in_var_handle)) {
-      continue;
-    }
-    auto t_out_p = out_var_handle->place_;
-    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                        ->FindVar(out_var_handle->name_);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    if (platform::is_gpu_place(in_tensor.place())) {
-      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                     "Places of input and output must be all on GPU.");
-    } else {
-      t_out_p = platform::CPUPlace();
-    }
-    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
-                                                            in_tensor.type());
-  }
+  InitOutputValue(*in_var_handle, out_var_handles);
 
   if (platform::is_cpu_place(in_tensor.place())) {
     for (auto *out_var_handle : out_var_handles) {
@@ -147,11 +125,37 @@ void BroadcastOpHandle::RunImpl() {
   }
 }
 
-void BroadcastOpHandle::WaitInputVarGenerated(const VarHandle &in_var) {
-  if (in_var.generated_op_) {
-    for (auto &pair : dev_ctxes_) {
-      in_var.generated_op_->Wait(pair.second);
+void BroadcastOpHandle::InitOutputValue(
+    const VarHandle &in_var_handle,
+    const std::vector<VarHandle *> &out_var_handles) const {
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+  auto *in_var =
+      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
+  // CPU.
+  for (auto *out_var_handle : out_var_handles) {
+    if (out_var_handle->IsTheSameVar(in_var_handle)) {
+      continue;
     }
+    auto t_out_p = out_var_handle->place_;
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                        ->FindVar(out_var_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    if (is_gpu_place(in_tensor.place())) {
+      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
+                     "Places of input and output must be all on GPU.");
+    } else {
+      t_out_p = platform::CPUPlace();
+    }
+    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
+                                                            in_tensor.type());
   }
 }
 
paddle/fluid/framework/details/broadcast_op_handle.h

Lines changed: 3 additions & 1 deletion

@@ -57,14 +57,16 @@ struct BroadcastOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-  void WaitInputVarGenerated(const VarHandle &in_var);
 
  private:
   const std::vector<Scope *> &local_scopes_;
   const std::vector<platform::Place> &places_;
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
+
+  void InitOutputValue(const VarHandle &in_var_handle,
+                       const std::vector<VarHandle *> &out_var_handles) const;
 };
 }  // namespace details
 }  // namespace framework
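
The deleted GPU/CPU consistency loop reappears inside the new private InitOutputValue helper, so RunImpl now only orchestrates. Below is a reduced sketch of the place rule that helper enforces: a GPU input requires every output place to be GPU, while a CPU input forces outputs onto CPU. The Place enum and ResolveOutputPlace are illustrative stand-ins, not paddle::platform types.

#include <stdexcept>

enum class Place { kCPU, kGPU };

// Sketch of InitOutputValue's place rule for a single output.
Place ResolveOutputPlace(Place in_place, Place requested_out) {
  if (in_place == Place::kGPU) {
    // Mirrors the PADDLE_ENFORCE in the diff above.
    if (requested_out != Place::kGPU) {
      throw std::runtime_error(
          "Places of input and output must be all on GPU.");
    }
    return requested_out;
  }
  return Place::kCPU;  // CPU input: outputs fall back to CPU
}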

paddle/fluid/framework/details/computation_op_handle.cc

Lines changed: 8 additions & 8 deletions

@@ -26,20 +26,20 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
       place_(place) {}
 
 void ComputationOpHandle::RunImpl() {
-  auto *cur_ctx = dev_ctxes_[place_];
-  for (auto *in : inputs_) {
-    bool need_wait = in->generated_op_ &&
-                     in->generated_op_->DeviceContext(place_) != cur_ctx;
-    if (need_wait) {
-      in->generated_op_->Wait(cur_ctx);
-    }
-  }
+  WaitInputVarGenerated(place_);
 
   this->RunAndRecordEvent([this] {
     op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
   });
 }
 
+bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
+  bool need_wait =
+      in_var && in_var->generated_op_ &&
+      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+  return need_wait;
+}
+
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
 }  // namespace details
 }  // namespace framework

paddle/fluid/framework/details/computation_op_handle.h

Lines changed: 2 additions & 0 deletions

@@ -36,6 +36,8 @@ struct ComputationOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+  virtual bool NeedWait(VarHandleBase *in_var);
+
  private:
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
paddle/fluid/framework/details/fetch_op_handle.cc

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ FetchOpHandle::~FetchOpHandle() {
3131
}
3232
}
3333

34-
void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
34+
void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
3535
PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
3636
}
3737

@@ -45,14 +45,8 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
4545
}
4646

4747
void FetchOpHandle::RunImpl() {
48-
auto cpu_ctx =
49-
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
50-
for (auto *input : inputs_) {
51-
auto *var = static_cast<VarHandle *>(input);
52-
if (var->generated_op_) {
53-
var->generated_op_->Wait(cpu_ctx);
54-
}
55-
}
48+
WaitInputVarGenerated(platform::CPUPlace());
49+
5650
tensors_.resize(inputs_.size());
5751
auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
5852
auto &var_name = var_handle->name_;
@@ -79,6 +73,15 @@ void FetchOpHandle::RunImpl() {
7973
this->WaitAndMergeCPUTensors();
8074
}
8175

76+
void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
77+
auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
78+
for (auto *input : inputs_) {
79+
if (input->generated_op_) {
80+
input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
81+
}
82+
}
83+
}
84+
8285
std::string FetchOpHandle::Name() const { return "Fetch"; }
8386

8487
} // namespace details

paddle/fluid/framework/details/fetch_op_handle.h

Lines changed: 3 additions & 1 deletion

@@ -33,7 +33,7 @@ struct FetchOpHandle : public OpHandleBase {
 
   ~FetchOpHandle();
 
-  void Wait(platform::DeviceContext *waited_dev) override;
+  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
 
   void WaitAndMergeCPUTensors() const;
 
@@ -42,6 +42,8 @@ struct FetchOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+  virtual void WaitInputVarGenerated(const platform::Place &place);
+
  private:
   FeedFetchList *data_;
   size_t offset_;
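
FetchOpHandle is the one op that customizes both halves of the protocol: it waits for its producers on the CPU context (fetched tensors are merged on the host), while RecordWaitEventOnCtx stays a hard failure because no downstream op should ever wait on a fetch. A minimal sketch of that sink-op contract, with illustrative stand-in types rather than the Paddle classes:

#include <stdexcept>
#include <vector>

struct OpHandle {
  // Called by consumers that want to wait on this op's outputs.
  virtual void RecordWaitEventOnCtx() { /* record event / sync */ }
  virtual ~OpHandle() = default;
};

struct Input {
  OpHandle *generated_op_ = nullptr;
};

struct FetchOp : OpHandle {
  std::vector<Input *> inputs_;

  // Wait on every producer, conceptually on the CPU device context.
  void WaitInputVarGenerated() {
    for (auto *in : inputs_) {
      if (in->generated_op_) in->generated_op_->RecordWaitEventOnCtx();
    }
  }

  // A fetch is a sink: nothing may wait on it, so this always throws.
  void RecordWaitEventOnCtx() override {
    throw std::runtime_error("Nobody should wait FetchOp.");
  }
};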

paddle/fluid/framework/details/gather_op_handle.cc

Lines changed: 1 addition & 12 deletions

@@ -55,7 +55,7 @@ void GatherOpHandle::RunImpl() {
                  "Currently, gather_op only can gather SelectedRows.");
 
   // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated(in_var_handles);
+  WaitInputVarGenerated();
 
   auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
   std::vector<int64_t> out_rows;
@@ -111,17 +111,6 @@ void GatherOpHandle::RunImpl() {
   });
 }
 
-void GatherOpHandle::WaitInputVarGenerated(
-    const std::vector<VarHandle *> &in_var_handles) {
-  for (auto *in : in_var_handles) {
-    if (in->generated_op_) {
-      for (auto pair : dev_ctxes_) {
-        in->generated_op_->Wait(pair.second);
-      }
-    }
-  }
-}
-
 std::string GatherOpHandle::Name() const { return "gather"; }
 }  // namespace details
 }  // namespace framework

paddle/fluid/framework/details/gather_op_handle.h

Lines changed: 0 additions & 1 deletion

@@ -39,7 +39,6 @@ struct GatherOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
 
  private:
   const std::vector<Scope *> &local_scopes_;

paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc

Lines changed: 1 addition & 6 deletions

@@ -34,12 +34,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
-    for (auto *in : inputs_) {
-      auto &p = static_cast<VarHandle *>(in)->place_;
-      if (in->generated_op_) {
-        in->generated_op_->Wait(dev_ctxes_[p]);
-      }
-    }
+    WaitInputVarGenerated();
 
     auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
     int dtype = -1;

paddle/fluid/framework/details/op_handle_base.cc

Lines changed: 25 additions & 3 deletions

@@ -56,15 +56,15 @@ void OpHandleBase::Run(bool use_event) {
   RunImpl();
 }
 
-void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
+void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+  if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
     for (auto &dev_ctx : dev_ctxes_) {
       dev_ctx.second->Wait();
     }
   } else {
     auto stream =
-        static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
     }
@@ -86,6 +86,28 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
   out->generated_op_ = this;
 }
 
+void OpHandleBase::WaitInputVarGenerated() {
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      for (auto &pair : dev_ctxes_) {
+        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+      }
+    }
+  }
+}
+
+void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
+  for (auto *in : inputs_) {
+    if (NeedWait(in)) {
+      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+    }
+  }
+}
+
+bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
+  return in_var && in_var->generated_op_;
+}
+
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
   if (!events_.empty()) {  // Use event
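
The rename from Wait to RecordWaitEventOnCtx matches what the CUDA branch above actually does: rather than blocking the host, it makes the waiting stream wait on events recorded by the producing op. A minimal sketch of that idiom, assuming a CUDA build (the helper name is hypothetical, not a Paddle API):

#include <cuda_runtime.h>

// Make `waiting` wait for all work queued on `producing` so far, without
// blocking the host thread. Error checking elided for brevity.
void MakeStreamWait(cudaStream_t waiting, cudaStream_t producing) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, producing);       // mark the producer's progress
  cudaStreamWaitEvent(waiting, ev, 0);  // device-side wait, host continues
  cudaEventDestroy(ev);                 // safe: freed once the wait resolves
}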
