
Commit 6d2ce74

Merge remote-tracking branch 'ups/develop' into remove/kwargs
2 parents: 808c3ef + f7af695

18 files changed: +267 −186 lines

paddle/fluid/framework/details/multi_devices_graph_pass.cc

Lines changed: 4 additions & 38 deletions
@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
  return recv_vars;
}

-bool MultiDevSSAGraphBuilder::IsDistTrainOp(
-    ir::Node *node, const std::vector<std::string> &send_vars,
-    const std::vector<std::string> &recv_vars) const {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
-  }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
-}
-
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
    const std::vector<std::string> &var_names) const {
  int64_t numel_sum = 0;
@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
      }
    }
    is_dist_train = true;
-  } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
+  } else if (boost::get<int>(node->Op()->GetAttr(
+                 OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             static_cast<int>(OpRole::kDist)) {
    int op_dev_id = CreateDistTrainOp(&result, node);
    if (node->Op()->Type() == "concat") {
      auto origin_param_name = node->Op()->OutputArgumentNames()[0];
@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
          .emplace(varname, op_dev_id);
    }
  } else {
+    LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
    PADDLE_THROW(
        "the distribute training related op should be in [split_byref, "
        "concat].");

paddle/fluid/framework/details/multi_devices_graph_pass.h

Lines changed: 0 additions & 6 deletions
@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;

-  /**
-   * Is this operator as the end-point operator before/after send operator.
-   */
-  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                     const std::vector<std::string> &recv_vars) const;
-
  std::vector<std::string> FindDistTrainSendVars(
      const std::vector<ir::Node *> &nodes) const;

paddle/fluid/framework/op_proto_maker.cc

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
          {static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kBackward),
           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+           static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),

paddle/fluid/framework/op_proto_maker.h

Lines changed: 6 additions & 0 deletions
@@ -26,7 +26,13 @@ enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,
  kOptimize = 0x0002,
+  // RPC role is for send/recv releated op
  kRPC = 0x0003,
+  // Dist role is for split_byref/split_selected_rows/concat
+  // used for distributed training.
+  kDist = 0x0004,
+  // Tag all learning rate scheduler operators.
+  kLRSched = 0x0005,

  kLoss = 0x0100,
  // The default value of op's role. This should be only used for unittests and
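
Note: the numbering keeps the existing convention that kLoss (0x0100) sits above the low-valued roles so it can be OR'ed with kForward/kBackward, which is why op_proto_maker.cc above whitelists the combined values alongside the new kDist and kLRSched entries. A small Python illustration of that arithmetic (values copied from the enum):

K_FORWARD, K_BACKWARD, K_OPTIMIZE = 0x0000, 0x0001, 0x0002
K_RPC, K_DIST, K_LRSCHED = 0x0003, 0x0004, 0x0005
K_LOSS = 0x0100

# kLoss occupies a high bit, so OR-ing never collides with the low role values:
assert K_LOSS | K_FORWARD == 0x0100
assert K_LOSS | K_BACKWARD == 0x0101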

paddle/fluid/operators/distributed/variable_response.cc

Lines changed: 6 additions & 2 deletions
@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
    ::google::protobuf::io::CodedInputStream* input,
    const platform::DeviceContext& ctx, const framework::DDim& dims,
    int length) {
+  auto server_var = GetVar();
+  if (!server_var) {
+    LOG(ERROR) << "recved var should not on current server: "
+               << meta_.varname();
+    return false;
+  }
  auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
  tensor->Resize(dims);
-
  framework::LoD lod;
  for (int i = 0; i < meta_.lod_level(); ++i) {
    framework::Vector<size_t> v;
@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(

  void* tensor_data =
      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
-
  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
    return false;
  }

paddle/fluid/operators/math/selected_rows_functor.cu

Lines changed: 2 additions & 2 deletions
@@ -107,7 +107,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);

    auto& in1_value = input1.value();
-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();

    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@@ -206,7 +206,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);

    auto& in1_value = input1.value();
-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();

    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);

paddle/fluid/operators/math/selected_rows_functor_test.cu

Lines changed: 6 additions & 2 deletions
@@ -20,7 +20,9 @@ limitations under the License. */
TEST(selected_rows_functor, gpu_add) {
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::platform::CUDADeviceContext& ctx =
+      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
                                       float>
      functor;
@@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) {
TEST(selected_rows_functor, gpu_add_to) {
  paddle::platform::CUDAPlace gpu_place(0);
  paddle::platform::CPUPlace cpu_place;
-  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::platform::CUDADeviceContext& ctx =
+      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
                                       float>
      functor;

paddle/fluid/pybind/const_value.cc

Lines changed: 3 additions & 1 deletion
@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
      .value("Backward", framework::OpRole::kBackward)
      .value("Optimize", framework::OpRole::kOptimize)
      .value("Loss", framework::OpRole::kLoss)
-      .value("RPC", framework::OpRole::kRPC);
+      .value("RPC", framework::OpRole::kRPC)
+      .value("Dist", framework::OpRole::kDist)
+      .value("LRSched", framework::OpRole::kLRSched);

  op_proto_and_checker_maker.def(
      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);

python/paddle/fluid/framework.py

Lines changed: 24 additions & 0 deletions
@@ -1509,6 +1509,30 @@ def _optimized_guard(self, param_and_grads):
        self._op_role_var = []
        self._current_role = OpRole.Forward

+    @contextlib.contextmanager
+    def _lr_schedule_guard(self):
+        """
+        A with guard to set :code:`LRSched` :code:`OpRole` and
+        :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
+        set to the target learning rate.
+
+        Notes: This is a very low level API. Users should not use it directly.
+
+        Examples:
+
+            >>> p, g = backward(...)
+            >>> with program.lr_schedule_guard():
+            >>>     lr = lr * decay
+        """
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.LRSched
+        # TODO(typhoonzero): how to set target learning rate var
+        self._op_role_var = []
+        yield
+        self._op_role_var = []
+        self._current_role = OpRole.Forward
+
    def __str__(self):
        """
        Get the protobuf debug string of this Program.
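
Note: a minimal usage sketch of the new guard; `main_program`, `lr_var`, and `decay_rate` are placeholder names, and the method is the underscored `_lr_schedule_guard` shown above:

# Ops created inside the guard are tagged with OpRole.LRSched, e.g. a decay step:
with main_program._lr_schedule_guard():
    decayed_lr = lr_var * decay_rate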
