PaddlePaddle
diff --git a/‎benchmark/cluster/vgg16/vgg16_fluid.py
Lines changed: 22 additions & 6 deletions b/‎benchmark/cluster/vgg16/vgg16_fluid.py
Lines changed: 22 additions & 6 deletions
diff --git a/‎doc/fluid/design/dist_train/distributed_traing_review.md
Lines changed: 0 additions & 4 deletions b/‎doc/fluid/design/dist_train/distributed_traing_review.md
Lines changed: 0 additions & 4 deletions
diff --git a/‎paddle/fluid/framework/block_desc.cc
Lines changed: 1 addition & 1 deletion b/‎paddle/fluid/framework/block_desc.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/fluid/framework/details/CMakeLists.txt
Lines changed: 3 additions & 1 deletion b/‎paddle/fluid/framework/details/CMakeLists.txt
Lines changed: 3 additions & 1 deletion
diff --git a/‎paddle/fluid/framework/details/broadcast_op_handle.cc
Lines changed: 87 additions & 19 deletions b/‎paddle/fluid/framework/details/broadcast_op_handle.cc
Lines changed: 87 additions & 19 deletions
diff --git a/‎paddle/fluid/framework/details/broadcast_op_handle.h
Lines changed: 22 additions & 1 deletion b/‎paddle/fluid/framework/details/broadcast_op_handle.h
Lines changed: 22 additions & 1 deletion
diff --git a/‎paddle/fluid/framework/details/broadcast_op_handle_test.cc
Lines changed: 33 additions & 3 deletions b/‎paddle/fluid/framework/details/broadcast_op_handle_test.cc
Lines changed: 33 additions & 3 deletions
@@ -80,6 +80,8 @@ def str2bool(v):
     type=str,
     default="",
     help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--profile", action='store_true', help="If set, profile a few steps.")
 
 # Flags for defining the tf.train.Server
 parser.add_argument(
@@ -183,8 +185,8 @@ def train_loop(exe, trainer_prog):
             start_time = time.time()
             num_samples = 0
             train_pass_acc.reset()
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
+
+            def run_step(batch_id, data):
                 img_data = np.array(
                     map(lambda x: x[0].reshape(data_shape), data)).astype(
                         "float32")
@@ -196,14 +198,28 @@ def train_loop(exe, trainer_prog):
                     feed={"pixel": img_data,
                           "label": y_data},
                     fetch_list=[avg_cost, batch_acc, batch_size])
+                return loss, acc, b_size
+
+            if args.profile and args.task_index == 0:
+                # warmup.
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id > 5: break
+                    run_step(batch_id, data)
+                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
+                    for batch_id, data in enumerate(train_reader()):
+                        if batch_id > 5: break
+                        run_step(batch_id, data)
+
+            for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
+                loss, acc, b_size = run_step(batch_id, data)
                 iters += 1
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
-                                             loss, acc,
-                                             len(data) / (time.time() - ts))
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
+                                            len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
 
@@ -42,7 +42,3 @@ Codistillation is a technique that tries to scale the training further. A few tr
 [3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
 
 [4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
-
-
-
-
@@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) {
 }
 
 void BlockDesc::RemoveOp(size_t s, size_t e) {
-  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+  if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) {
     return;
   }
   need_update_ = true;
 
@@ -15,12 +15,14 @@ if(WITH_GPU)
             dynload_cuda)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+
 else()
     set(multi_devices_graph_builder_deps)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 
-cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
 
@@ -19,14 +19,12 @@
 namespace paddle {
 namespace framework {
 namespace details {
-BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                                     const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
 
 void BroadcastOpHandle::RunImpl() {
-  // the input and output may have dummy var.
-  VarHandle *in_var_handle;
+  if (places_.size() == 1) return;
 
+  // The input and output may have dummy vars.
+  VarHandle *in_var_handle;
   {
     auto in_var_handles = DynamicCast<VarHandle>(inputs_);
     PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
@@ -55,27 +53,97 @@ void BroadcastOpHandle::RunImpl() {
 
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
 
-  for (auto *out : out_var_handles) {
-    if (*out == *in_var_handle) {
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
+  // CPU.
+  for (auto *out_var_handle : out_var_handles) {
+    if (out_var_handle->IsTheSameVar(*in_var_handle)) {
       continue;
     }
-
-    auto &out_p = out->place_;
-    auto *out_var = var_scopes.at(out->scope_idx_)->FindVar(out->name_);
+    auto t_out_p = out_var_handle->place_;
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                        ->FindVar(out_var_handle->name_);
     PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_EQ(out_p.which(), in_var_handle->place_.which(),
-                      "Places must be all on CPU or all on CUDA.");
-
+    if (platform::is_gpu_place(in_tensor.place())) {
+      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
+                     "Places of input and output must be all on GPU.");
+    } else {
+      t_out_p = platform::CPUPlace();
+    }
     VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-    VariableVisitor::GetMutableTensor(out_var).mutable_data(out_p,
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
                                                             in_tensor.type());
+  }
+
+  if (platform::is_cpu_place(in_tensor.place())) {
+    for (auto *out_var_handle : out_var_handles) {
+      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
+        continue;
+      }
+      auto &out_p = out_var_handle->place_;
+      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                          ->FindVar(out_var_handle->name_);
+
+      RunAndRecordEvent(out_p, [in_tensor, out_var] {
+        paddle::framework::TensorCopy(
+            in_tensor, platform::CPUPlace(),
+            &VariableVisitor::GetMutableTensor(out_var));
+      });
+    }
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    VarHandle *out_handle = nullptr;
+    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
+    std::vector<std::function<void()>> broadcast_calls;
+
+    for (auto out_var_handle : out_var_handles) {
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                              ->FindVar(out_var_handle->name_);
+
+      int dst_id =
+          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
+
+      auto &nccl_ctx = nccl_ctxs_->at(dst_id);
+
+      void *send_recv_buffer = nullptr;
+      if (root_id == dst_id) {
+        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
+        out_handle = out_var_handle;
+      } else {
+        send_recv_buffer =
+            VariableVisitor::GetMutableTensor(out_var).mutable_data(
+                out_var_handle->place_);
+      }
+
+      int type = platform::ToNCCLDataType(in_tensor.type());
+      size_t numel = static_cast<size_t>(in_tensor.numel());
+      broadcast_calls.emplace_back(
+          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
+            PADDLE_ENFORCE(platform::dynload::ncclBcast(
+                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
+                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
+          });
+    }
 
-    auto dev_ctx = dev_ctxes_.at(out_p);
-    RunAndRecordEvent(out_p, [in_tensor, out_var, dev_ctx, out_p] {
-      paddle::framework::TensorCopy(
-          in_tensor, out_p, *(dev_ctx),
-          &VariableVisitor::GetMutableTensor(out_var));
+    this->RunAndRecordEvent([&] {
+      {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : broadcast_calls) {
+          call();
+        }
+      }
+
+      if (!out_handle->IsTheSameVar(*in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                           ->FindVar(out_var_handles[0]->name_);
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle->place_,
+            *(dev_ctxes_.at(in_var_handle->place_)),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
     });
+#else
+    PADDLE_THROW("CUDA is not enabled.");
+#endif
   }
 }
 
 
@@ -24,14 +24,32 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
 
 struct BroadcastOpHandle : public OpHandleBase {
  public:
+#ifdef PADDLE_WITH_CUDA
+  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::NCCLContextMap *nccl_ctxs)
+      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
+    }
+  }
+#else
   BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
-                    const std::vector<platform::Place> &places);
+                    const std::vector<platform::Place> &places)
+      : local_scopes_(local_scopes), places_(places) {}
+#endif
 
   std::string Name() const override;
 
@@ -44,6 +62,9 @@ struct BroadcastOpHandle : public OpHandleBase {
  private:
   const std::vector<Scope *> &local_scopes_;
   const std::vector<platform::Place> &places_;
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
 };
 }  // namespace details
 }  // namespace framework
 
@@ -35,15 +35,25 @@ struct TestBroadcastOpHandle {
   std::unique_ptr<OpHandleBase> op_handle_;
   std::vector<std::unique_ptr<VarHandleBase>> vars_;
   std::vector<p::Place> gpu_list_;
+  bool use_gpu_;
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
 
   void WaitAll() {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
   }
 
   void InitCtxOnGpu(bool use_gpu) {
-    if (use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
@@ -57,6 +67,7 @@ struct TestBroadcastOpHandle {
         gpu_list_.push_back(p);
         ctxs_.emplace_back(new p::CUDADeviceContext(p));
       }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
       PADDLE_THROW("CUDA is not support.");
 #endif
@@ -67,6 +78,9 @@ struct TestBroadcastOpHandle {
         gpu_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
     }
   }
 
@@ -82,7 +96,21 @@ struct TestBroadcastOpHandle {
     }
     param_scopes_[input_scope_idx]->Var("input");
 
-    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
 
     auto* in_var_handle =
         new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
@@ -97,7 +125,9 @@ struct TestBroadcastOpHandle {
     op_handle_->AddInput(dummy_var_handle);
 
     for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      }
       VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
       vars_.emplace_back(out_var_handle);
       op_handle_->AddOutput(out_var_handle);
Original file line number	Diff line number	Diff line change
`@@ -143,7 +143,7 @@ OpDesc *BlockDesc::InsertOp(size_t index) {`
`143`	`143`	`}`
`144`	`144`
`145`	`145`	`void BlockDesc::RemoveOp(size_t s, size_t e) {`
`146`		`- if (ops_.begin() + s == ops_.end() \|\| ops_.begin() + e == ops_.end()) {`
	`146`	`+ if (ops_.begin() + s >= ops_.end() \|\| ops_.begin() + e > ops_.end()) {`
`147`	`147`	`return;`
`148`	`148`	`}`
`149`	`149`	`need_update_ = true;`