
Commit 781d284

Optimize decay (#20816) (#20952)
* update pserver decay blocks
* update distributed notify handler
1 parent 55c2329 commit 781d284

17 files changed: +399 / -194 lines

paddle/fluid/framework/details/async_ssa_graph_executor.cc

Lines changed: 9 additions & 1 deletion
@@ -62,8 +62,16 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
           node->Op()->GetNullableAttr("sections"));
       auto trainer_id =
           boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
+      auto merge_add =
+          boost::get<bool>(node->Op()->GetNullableAttr("merge_add"));
+      if (!merge_add) {
+        merge_add = FLAGS_communicator_is_sgd_optimizer;
+      }
+      auto use_send_handler =
+          boost::get<bool>(node->Op()->GetNullableAttr("use_send_handler"));
       send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
-          send_var_name, send_varnames, epmap, height_section, trainer_id);
+          send_var_name, send_varnames, epmap, height_section, trainer_id,
+          merge_add, use_send_handler);
       VLOG(3) << "find and init an send op: "
               << send_varname_to_ctx[send_var_name];
     } else if (node->Name() == "recv") {
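The two attributes read above, "merge_add" and "use_send_handler", are carried into the RpcContext that the communicator and ParameterSend consult later in this commit. The context definition itself (rpc_common.h) is not part of the excerpt shown here, so the following is only an assumed sketch of the fields the new constructor arguments imply; the member names follow the call site above and the ctx.merge_add / ctx.use_send_handler reads in communicator.cc below.

// Hypothetical sketch of the extended send context -- not the actual
// rpc_common.h definition, which may differ in member names and order.
struct RpcContext {
  std::string var_name;                        // variable registered for sending
  std::vector<std::string> splited_var_names;  // per-pserver slices of var_name
  std::vector<std::string> epmap;              // pserver endpoints
  std::vector<int64_t> height_sections;        // row count of each slice
  int trainer_id = 0;
  bool merge_add = true;         // true: sum merged updates, false: average them
  bool use_send_handler = true;  // true: regular SendVar RPC, false: DistributeNotify
};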

paddle/fluid/operators/distributed/communicator.cc

Lines changed: 15 additions & 3 deletions
@@ -130,8 +130,15 @@ void AsyncCommunicator::InitImpl(const paddle::framework::ProgramDesc &program,
       auto height_section =
           boost::get<std::vector<int64_t>>(op->GetNullableAttr("sections"));
       auto trainer_id = boost::get<int>(op->GetNullableAttr("trainer_id"));
+      auto merge_add = boost::get<bool>(op->GetNullableAttr("merge_add"));
+      if (!merge_add) {
+        merge_add = FLAGS_communicator_is_sgd_optimizer;
+      }
+      auto use_send_handler =
+          boost::get<bool>(op->GetNullableAttr("use_send_handler"));
       send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
-          send_var_name, send_varnames, epmap, height_section, trainer_id);
+          send_var_name, send_varnames, epmap, height_section, trainer_id,
+          merge_add, use_send_handler);
       VLOG(3) << "find and init an send op: "
               << send_varname_to_ctx[send_var_name];
     } else if (op->Type() == "recv") {
@@ -208,12 +215,17 @@ void AsyncCommunicator::SendThread() {
             }
           }
           auto before_merge = GetCurrentUS();
-          MergeVars(var_name, vars, send_scope_.get());
+          auto &ctx = send_varname_to_ctx_.at(var_name);
+          if (ctx.use_send_handler) {
+            MergeVars<float>(var_name, vars, send_scope_.get(), ctx.merge_add);
+          } else {
+            MergeVars<int64_t>(var_name, vars, send_scope_.get(),
+                               ctx.merge_add);
+          }
           auto after_merge = GetCurrentUS();
           VLOG(3) << "merge " << merged_var_num << " " << var_name
                   << " use time " << after_merge - before_merge;
           auto send_functor = distributed::ParameterSend<float>();
-          auto &ctx = send_varname_to_ctx_.at(var_name);
           if (!FLAGS_communicator_fake_rpc) {
             send_functor(ctx, *send_scope_, true, 1);
           }
paddle/fluid/operators/distributed/communicator.h

Lines changed: 16 additions & 19 deletions
@@ -107,21 +107,21 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+template <typename T>
 inline void MergeVars(const std::string& var_name,
                       const std::vector<std::shared_ptr<Variable>>& vars,
-                      Scope* scope) {
+                      Scope* scope, bool merge_add = true) {
   PADDLE_ENFORCE(!vars.empty(), "should have value to merge!");
   auto cpu_place = platform::CPUPlace();
   auto& var0 = vars[0];
   auto* out_var = scope->Var(var_name);
   if (var0->IsType<framework::LoDTensor>()) {
     auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
-
+    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims
+            << "; merge add: " << merge_add;
     // init output tensor
     auto* out_t = out_var->GetMutable<framework::LoDTensor>();
-    out_t->mutable_data<float>(dims, cpu_place);
-
+    out_t->mutable_data<T>(dims, cpu_place);
     // check the input dims
     for (auto& var : vars) {
       auto& var_t = var->Get<framework::LoDTensor>();
@@ -130,44 +130,41 @@ inline void MergeVars(const std::string& var_name,
 
     // set output tensor to 0.
     auto cpu_ctx = paddle::platform::CPUDeviceContext();
-    math::SetConstant<paddle::platform::CPUDeviceContext, float>
-        constant_functor;
-    constant_functor(cpu_ctx, out_t, static_cast<float>(0));
-
+    math::SetConstant<paddle::platform::CPUDeviceContext, T> constant_functor;
+    constant_functor(cpu_ctx, out_t, static_cast<T>(0));
     // sum all vars to out
-    auto result = EigenVector<float>::Flatten(*out_t);
+    auto result = EigenVector<T>::Flatten(*out_t);
     for (auto& var : vars) {
       auto& in_t = var->Get<framework::LoDTensor>();
-      auto in = EigenVector<float>::Flatten(in_t);
+      auto in = EigenVector<T>::Flatten(in_t);
       result.device(*cpu_ctx.eigen_device()) = result + in;
     }
-    if (!FLAGS_communicator_is_sgd_optimizer) {
+    if (!merge_add) {
       result.device(*cpu_ctx.eigen_device()) =
-          result / static_cast<float>(vars.size());
+          result / static_cast<T>(vars.size());
     }
   } else if (var0->IsType<framework::SelectedRows>()) {
     auto& slr0 = var0->Get<framework::SelectedRows>();
     auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
     out_slr->mutable_rows()->clear();
-    out_slr->mutable_value()->mutable_data<float>({{}}, cpu_place);
+    out_slr->mutable_value()->mutable_data<T>({{}}, cpu_place);
     std::vector<const paddle::framework::SelectedRows*> inputs;
     inputs.reserve(vars.size());
     for (auto& var : vars) {
       inputs.push_back(&var->Get<framework::SelectedRows>());
     }
     auto dev_ctx = paddle::platform::CPUDeviceContext();
-    if (FLAGS_communicator_is_sgd_optimizer) {
-      math::scatter::MergeAdd<paddle::platform::CPUDeviceContext, float>
-          merge_add;
+    if (merge_add) {
+      math::scatter::MergeAdd<paddle::platform::CPUDeviceContext, T> merge_add;
       merge_add(dev_ctx, inputs, out_slr);
     } else {
-      math::scatter::MergeAverage<paddle::platform::CPUDeviceContext, float>
+      math::scatter::MergeAverage<paddle::platform::CPUDeviceContext, T>
          merge_average;
      merge_average(dev_ctx, inputs, out_slr);
    }
 
    VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height()
-            << " dims: " << slr0.value().dims();
+            << " dims: " << slr0.value().dims() << "; merge add: " << merge_add;
  } else {
    PADDLE_THROW("unsupported var type!");
  }
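
For the dense (LoDTensor) branch, merge_add only decides whether the element-wise sum over the queued inputs is divided by their count. A small self-contained sketch of that arithmetic on plain vectors (no Paddle types), assuming all inputs have equal length:

#include <cassert>
#include <cstddef>
#include <vector>

// Sum (merge_add = true) or average (merge_add = false) a batch of inputs,
// matching what MergeVars<T> does in the LoDTensor branch above.
template <typename T>
std::vector<T> MergeDense(const std::vector<std::vector<T>>& inputs,
                          bool merge_add) {
  assert(!inputs.empty());
  std::vector<T> out(inputs[0].size(), static_cast<T>(0));
  for (const auto& in : inputs) {
    for (std::size_t i = 0; i < in.size(); ++i) out[i] += in[i];
  }
  if (!merge_add) {
    for (auto& v : out) v /= static_cast<T>(inputs.size());
  }
  return out;
}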

paddle/fluid/operators/distributed/communicator_test.cc

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ TEST(communicator, merge_lod_tensors) {
   scope.reset(new framework::Scope());
   scope->Var(out_name);
   for (auto i = 0; i < 10; ++i) {
-    MergeVars(out_name, in_vars, scope.get());
+    MergeVars<float>(out_name, in_vars, scope.get());
   }
   auto &out_tensor = scope->FindVar(out_name)->Get<LoDTensor>();
   auto *out_data = out_tensor.data<float>();
@@ -86,7 +86,7 @@ TEST(communicator, merge_selected_rows) {
   scope.reset(new framework::Scope());
   scope->Var(out_name);
   for (auto i = 0; i < 10; ++i) {
-    MergeVars(out_name, in_vars, scope.get());
+    MergeVars<float>(out_name, in_vars, scope.get());
   }
   auto &out_slr = scope->FindVar(out_name)->Get<SelectedRows>();
   auto &out_t = out_slr.value();

paddle/fluid/operators/distributed/grpc/grpc_client.cc

Lines changed: 28 additions & 14 deletions
@@ -438,26 +438,40 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
   return h;
 }
 
-VarHandlePtr GRPCClient::AsyncDistributeNotify(const std::string& ep,
-                                               const std::string& type,
-                                               int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  DistributeNotifyProcessor* s = new DistributeNotifyProcessor(ch);
-
+VarHandlePtr GRPCClient::AsyncDistributeNotify(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
   const std::string method = kRequestNotify;
 
-  VarHandlePtr h(
-      new VarHandle(ep, method, LEARNING_RATE_DECAY_MESSAGE, nullptr, nullptr));
+  SendProcessor* s = new SendProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  sendrecv::VariableMessage req;
-  req.set_varname(type);
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
+    auto* var = p_scope->FindVar(var_name_val);
 
-  platform::RecordRPCEvent record_event(method);
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
 
-  auto rpc = s->stub_->AsyncDistributeNotify(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+
+    // stub context
+    s->response_call_back_ = nullptr;
+
+    platform::RecordRPCEvent record_event(method);
+
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req,
+        &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  });
   req_count_++;
 
   if (UNLIKELY(platform::IsProfileEnabled())) {

paddle/fluid/operators/distributed/grpc/grpc_client.h

Lines changed: 2 additions & 15 deletions
@@ -173,20 +173,6 @@ class CheckpointNotifyProcessor : public BaseProcessor {
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
-class DistributeNotifyProcessor : public BaseProcessor {
- public:
-  explicit DistributeNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor() {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-  }
-
-  virtual ~DistributeNotifyProcessor() {}
-
-  void ProcessImpl() override {}
-  sendrecv::VoidMessage reply_;
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
-};
-
 class GRPCClient : public RPCClient {
  public:
   GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
@@ -240,7 +226,8 @@ class GRPCClient : public RPCClient {
       int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncDistributeNotify(
-      const std::string& ep, const std::string& type,
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
       int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncSendComplete(

paddle/fluid/operators/distributed/grpc/grpc_server.cc

Lines changed: 9 additions & 11 deletions
@@ -400,33 +400,31 @@ class RequestNotify final : public RequestBase {
                 RequestHandler* request_handler, int req_id)
       : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
     request_.reset(new GRPCVariableResponse(request_handler->scope(),
-                                            request_handler->dev_ctx()));
+                                            request_handler->dev_ctx(),
+                                            !request_handler->sync_mode()));
     int method_id = static_cast<int>(distributed::GrpcMethod::kRequestNotify);
     service_->RequestAsyncUnary(
         method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
         reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
-
   virtual ~RequestNotify() {}
-
   std::string GetReqName() override { return request_->Varname(); }
 
   void Process() override {
-    auto scope = request_->GetMutableLocalScope();
+    std::string varname = GetReqName();
+    VLOG(4) << "RequestNotify var_name:" << varname;
 
-    std::string varname = request_->Varname();
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = request_->GetVar();
     int trainer_id = request_->GetTrainerId();
-
-    VLOG(4) << "RequestNotify notify: " << varname
-            << ", trainer id: " << trainer_id;
-
-    request_handler_->Handle(varname, scope, nullptr, nullptr, trainer_id);
+    framework::Variable* outvar = nullptr;
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
     Finish(reply_, &responder_);
   }
 
  protected:
-  std::shared_ptr<GRPCVariableResponse> request_;
   sendrecv::VoidMessage reply_;
+  std::shared_ptr<GRPCVariableResponse> request_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };

paddle/fluid/operators/distributed/parameter_send.cc

Lines changed: 36 additions & 16 deletions
@@ -116,24 +116,44 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
         row_offset += outs_dims[i][0];
       }
     }
-
-    for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
-      auto &send_var_name = rpc_ctx.splited_var_names[i];
-      VLOG(4) << "send var name: " << send_var_name;
-      auto &endpoint = rpc_ctx.epmap[i];
-      VLOG(4) << "send var endpoint: " << endpoint;
-      VLOG(4) << "need send: " << NeedSend(*local_scope.get(), send_var_name);
-      if (NeedSend(*local_scope.get(), send_var_name)) {
-        VLOG(3) << "sending " << send_var_name << " to " << endpoint;
-        rets.push_back(rpc_client->AsyncSendVar(
-            endpoint, cpu_ctx, *local_scope.get(), send_var_name));
-        VLOG(4) << "send var " << send_var_name << " async handle done";
-      } else {
-        VLOG(3) << "don't send non-initialized variable: "
-                << rpc_ctx.splited_var_names[i];
+    if (rpc_ctx.use_send_handler) {
+      for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+        auto &send_var_name = rpc_ctx.splited_var_names[i];
+        VLOG(4) << "send var name: " << send_var_name;
+        auto &endpoint = rpc_ctx.epmap[i];
+        VLOG(4) << "send var endpoint: " << endpoint;
+        VLOG(4) << "need send: " << NeedSend(*local_scope.get(), send_var_name);
+        if (NeedSend(*local_scope.get(), send_var_name)) {
+          VLOG(3) << "sending " << send_var_name << " to " << endpoint;
+          rets.push_back(rpc_client->AsyncSendVar(
+              endpoint, cpu_ctx, *local_scope.get(), send_var_name));
+          VLOG(4) << "send var " << send_var_name << " async handle done";
+        } else {
+          VLOG(3) << "don't send non-initialized variable: "
+                  << rpc_ctx.splited_var_names[i];
+        }
+      }
+    } else {
+      for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+        for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) {
+          auto &send_var_name = rpc_ctx.splited_var_names[i];
+          VLOG(4) << "send var name: " << send_var_name;
+          auto &endpoint = rpc_ctx.epmap[j];
+          VLOG(4) << "send var endpoint: " << endpoint;
+          VLOG(4) << "need send: "
+                  << NeedSend(*local_scope.get(), send_var_name);
+          if (NeedSend(*local_scope.get(), send_var_name)) {
+            VLOG(3) << "sending " << send_var_name << " to " << endpoint;
+            rets.push_back(rpc_client->AsyncDistributeNotify(
+                endpoint, cpu_ctx, *local_scope.get(), send_var_name));
+            VLOG(4) << "send var " << send_var_name << " async handle done";
+          } else {
+            VLOG(3) << "don't send non-initialized variable: "
+                    << rpc_ctx.splited_var_names[i];
+          }
+        }
       }
     }
-
   } else if (send_var->IsType<framework::SelectedRows>()) {
     auto &send_slr = send_var->Get<framework::SelectedRows>();
     auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections);
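
The new else branch changes the fan-out pattern: with the regular send handler, slice i of a split variable goes only to its matching endpoint epmap[i], while a notify-style variable (the decay counter, which is not split) is sent to every endpoint so each pserver can update its own copy. A stripped-down sketch of the two index patterns follows; SendSlice and NotifyCounter are placeholders for rpc_client->AsyncSendVar and rpc_client->AsyncDistributeNotify.

#include <cstddef>
#include <string>
#include <vector>

// Placeholders for rpc_client->AsyncSendVar / rpc_client->AsyncDistributeNotify.
void SendSlice(const std::string& var, const std::string& ep);
void NotifyCounter(const std::string& var, const std::string& ep);

void FanOut(const std::vector<std::string>& splited_var_names,
            const std::vector<std::string>& epmap, bool use_send_handler) {
  if (use_send_handler) {
    // regular send: slice i goes to its matching pserver endpoint i
    for (std::size_t i = 0; i < splited_var_names.size(); ++i) {
      SendSlice(splited_var_names[i], epmap[i]);
    }
  } else {
    // notify send: every variable is broadcast to every pserver endpoint
    for (std::size_t i = 0; i < splited_var_names.size(); ++i) {
      for (std::size_t j = 0; j < epmap.size(); ++j) {
        NotifyCounter(splited_var_names[i], epmap[j]);
      }
    }
  }
}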

paddle/fluid/operators/distributed/request_handler.h

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 #define COMPLETE_MESSAGE "COMPLETE@RECV"
 #define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV"
-#define LEARNING_RATE_DECAY_MESSAGE "LRDECAY@RECV"
+#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@"
 
 #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
 #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"

paddle/fluid/operators/distributed/request_handler_impl.cc

Lines changed: 16 additions & 2 deletions
@@ -262,11 +262,25 @@ bool RequestNotifyHandler::Handle(const std::string& varname,
                                   const int trainer_id,
                                   const std::string& out_var_name,
                                   const std::string& table_name) {
-  VLOG(4) << "RequestNotifyHandler" << varname;
-  if (varname == LEARNING_RATE_DECAY_MESSAGE) {
+  VLOG(4) << "RequestNotifyHandler: " << varname;
+  VLOG(3) << "async process var: " << varname << ", trainer_id: " << trainer_id;
+
+  string::Piece decay_piece(LEARNING_RATE_DECAY_COUNTER);
+  string::Piece var_name_piece = string::Piece(varname);
+  if (string::Contains(var_name_piece, decay_piece)) {
+    VLOG(3) << "LearningRate Decay Counter Update";
     PADDLE_ENFORCE_NE(
         lr_decay_block_id, -1,
         "when lr_decay_block_id = -1, there should be no RPC invoke.");
+    auto* origin_var = scope_->FindVar(varname);
+    auto origin_var_tensor = origin_var->Get<framework::LoDTensor>();
+    auto* send_var = scope->FindVar(varname);
+    auto send_var_tensor = send_var->Get<framework::LoDTensor>();
+    int64_t* origin_value =
+        origin_var_tensor.mutable_data<int64_t>(origin_var_tensor.place());
+    int64_t* send_value =
+        send_var_tensor.mutable_data<int64_t>(send_var_tensor.place());
+    origin_value[0] += send_value[0];
     executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_);
   }
   return true;