
Commit 29c63d1

[Feature] dist op role and lr op role, to support memory optimize with dist training (#13220)
* wip
* clean up
* should fix running with memopt
* add ut
* mark lr schedule op role
* hide lr_schedule_guard
* use op_role_var instead of ufind
* unify dist test name
* wip for py3 support
* fix var deref
* fix python3 mem_opt order
* remove comments
1 parent 2d97903 commit 29c63d1

15 files changed: +257 -181 lines
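
The core of the change is that every operator carries an integer op_role attribute; this commit adds two new values, Dist and LRSched, and tags the relevant operators with them, so later passes (the multi-devices graph builder below, and memory optimization for distributed training) can identify those ops by attribute instead of by variable-name heuristics. A minimal sketch of reading that attribute from Python once the commit is applied; the loop itself is illustrative and not part of the diff:

import paddle.fluid as fluid
from paddle.fluid import core

op_maker = core.op_proto_and_checker_maker
ROLE_ATTR = op_maker.kOpRoleAttrName()  # name of the role attribute every op carries

for op in fluid.default_main_program().global_block().ops:
    role = op.attr(ROLE_ATTR)
    if role == int(op_maker.OpRole.Dist):
        print("dist op:", op.type)       # e.g. split_byref / concat inserted by the transpiler
    elif role == int(op_maker.OpRole.LRSched):
        print("lr schedule op:", op.type)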

paddle/fluid/framework/details/multi_devices_graph_pass.cc

Lines changed: 4 additions & 38 deletions
@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
   return recv_vars;
 }

-bool MultiDevSSAGraphBuilder::IsDistTrainOp(
-    ir::Node *node, const std::vector<std::string> &send_vars,
-    const std::vector<std::string> &recv_vars) const {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
-  }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
-}
-
 size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
     const std::vector<std::string> &var_names) const {
   int64_t numel_sum = 0;
@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
         }
       }
       is_dist_train = true;
-    } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
+    } else if (boost::get<int>(node->Op()->GetAttr(
+                   OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kDist)) {
       int op_dev_id = CreateDistTrainOp(&result, node);
       if (node->Op()->Type() == "concat") {
         auto origin_param_name = node->Op()->OutputArgumentNames()[0];
@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
           .emplace(varname, op_dev_id);
     }
   } else {
+    LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
     PADDLE_THROW(
         "the distribute training related op should be in [split_byref, "
        "concat].");

paddle/fluid/framework/details/multi_devices_graph_pass.h

Lines changed: 0 additions & 6 deletions
@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
   int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;

-  /**
-   * Is this operator as the end-point operator before/after send operator.
-   */
-  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                     const std::vector<std::string> &recv_vars) const;
-
   std::vector<std::string> FindDistTrainSendVars(
       const std::vector<ir::Node *> &nodes) const;


paddle/fluid/framework/op_proto_maker.cc

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
       {static_cast<int>(OpRole::kForward),
        static_cast<int>(OpRole::kBackward),
        static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+       static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
        static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
        static_cast<int>(OpRole::kLoss) |
            static_cast<int>(OpRole::kBackward),

paddle/fluid/framework/op_proto_maker.h

Lines changed: 6 additions & 0 deletions
@@ -26,7 +26,13 @@ enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
+  // RPC role is for send/recv releated op
   kRPC = 0x0003,
+  // Dist role is for split_byref/split_selected_rows/concat
+  // used for distributed training.
+  kDist = 0x0004,
+  // Tag all learning rate scheduler operators.
+  kLRSched = 0x0005,

   kLoss = 0x0100,
   // The default value of op's role. This should be only used for unittests and
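
For reference, the role values are small integers and kLoss acts as a flag that is OR-ed with kForward or kBackward; those two combinations are exactly the extra entries allowed by the op_proto_maker.cc hunk above, while kDist and kLRSched are standalone values. A quick check in plain Python, with the values copied from the enum:

FORWARD, BACKWARD, OPTIMIZE, RPC = 0x0000, 0x0001, 0x0002, 0x0003
DIST, LRSCHED, LOSS = 0x0004, 0x0005, 0x0100

assert LOSS | FORWARD == 0x0100   # a loss op in the forward pass
assert LOSS | BACKWARD == 0x0101  # the loss op's backward op
assert DIST == 0x0004 and LRSCHED == 0x0005  # new roles, no flag combination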

paddle/fluid/operators/distributed/variable_response.cc

Lines changed: 6 additions & 2 deletions
@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
     ::google::protobuf::io::CodedInputStream* input,
     const platform::DeviceContext& ctx, const framework::DDim& dims,
     int length) {
+  auto server_var = GetVar();
+  if (!server_var) {
+    LOG(ERROR) << "recved var should not on current server: "
+               << meta_.varname();
+    return false;
+  }
   auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
   tensor->Resize(dims);
-
   framework::LoD lod;
   for (int i = 0; i < meta_.lod_level(); ++i) {
     framework::Vector<size_t> v;
@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(

   void* tensor_data =
       tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
-
   if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
     return false;
   }

paddle/fluid/pybind/const_value.cc

Lines changed: 3 additions & 1 deletion
@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
       .value("Backward", framework::OpRole::kBackward)
       .value("Optimize", framework::OpRole::kOptimize)
       .value("Loss", framework::OpRole::kLoss)
-      .value("RPC", framework::OpRole::kRPC);
+      .value("RPC", framework::OpRole::kRPC)
+      .value("Dist", framework::OpRole::kDist)
+      .value("LRSched", framework::OpRole::kLRSched);

   op_proto_and_checker_maker.def(
       "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);

python/paddle/fluid/framework.py

Lines changed: 24 additions & 0 deletions
@@ -1509,6 +1509,30 @@ def _optimized_guard(self, param_and_grads):
         self._op_role_var = []
         self._current_role = OpRole.Forward

+    @contextlib.contextmanager
+    def _lr_schedule_guard(self):
+        """
+        A with guard to set :code:`LRSched` :code:`OpRole` and
+        :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
+        set to the target learning rate.
+
+        Notes: This is a very low level API. Users should not use it directly.
+
+
+        Examples:
+
+            >>> p, g = backward(...)
+            >>> with program.lr_schedule_guard():
+            >>>     lr = lr * decay
+        """
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.LRSched
+        # TODO(typhoonzero): how to set target learning rate var
+        self._op_role_var = []
+        yield
+        self._op_role_var = []
+        self._current_role = OpRole.Forward
+
     def __str__(self):
         """
         Get the protobuf debug string of this Program.
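
The guard works the same way as the existing _optimized_guard: while it is active, Program._current_role is LRSched, so every op appended inside the with block is created with that role. A sketch of that effect, assuming the role is attached at op-creation time as for the other guards (the fill_constant layer is just a stand-in for a decay computation):

import paddle.fluid as fluid
from paddle.fluid import core

op_maker = core.op_proto_and_checker_maker
prog = fluid.default_main_program()

with prog._lr_schedule_guard():
    lr = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.01)

last_op = prog.global_block().ops[-1]
assert last_op.attr(op_maker.kOpRoleAttrName()) == int(op_maker.OpRole.LRSched)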

python/paddle/fluid/layers/learning_rate_scheduler.py

Lines changed: 73 additions & 64 deletions
@@ -27,7 +27,7 @@
 from . import ops
 from . import tensor
 from ..initializer import init_on_cpu
-from ..framework import default_main_program, Parameter
+from ..framework import default_main_program, Parameter, unique_name

 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps):
     Returns:
         The decayed learning rate.
     """
-    global_step = _decay_step_counter(1)
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter(1)

-    a = global_step**-0.5
-    b = (warmup_steps**-1.5) * global_step
-    lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
+        a = global_step**-0.5
+        b = (warmup_steps**-1.5) * global_step
+        lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)

     return lr_value

@@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
           sgd_optimizer.minimize(avg_cost)

     """
-    global_step = _decay_step_counter()
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()

-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = ops.floor(div_res)
-    decayed_lr = learning_rate * (decay_rate**div_res)
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)
+        decayed_lr = learning_rate * (decay_rate**div_res)

-    return decayed_lr
+        return decayed_lr


 def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     Returns:
         The decayed learning rate
     """
-    global_step = _decay_step_counter()
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()

-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = ops.floor(div_res)
-    decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)
+        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)

-    return decayed_lr
+        return decayed_lr


 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
                   staircase=True))
           sgd_optimizer.minimize(avg_cost)
     """
-    global_step = _decay_step_counter()
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()

-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = ops.floor(div_res)
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = ops.floor(div_res)

-    decayed_lr = learning_rate / (1 + decay_rate * div_res)
+        decayed_lr = learning_rate / (1 + decay_rate * div_res)

-    return decayed_lr
+        return decayed_lr


 def polynomial_decay(learning_rate,
@@ -220,25 +224,28 @@ def polynomial_decay(learning_rate,
     Returns:
         Variable: The decayed learning rate
     """
-    global_step = _decay_step_counter()
-
-    if cycle:
-        div_res = ops.ceil(global_step / decay_steps)
-        zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0)
-        one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0)
-
-        with control_flow.Switch() as switch:
-            with switch.case(global_step == zero_var):
-                tensor.assign(input=one_var, output=div_res)
-        decay_steps = decay_steps * div_res
-    else:
-        decay_steps_var = tensor.fill_constant(
-            shape=[1], dtype='float32', value=float(decay_steps))
-        global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
+    with default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
+
+        if cycle:
+            div_res = ops.ceil(global_step / decay_steps)
+            zero_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=0.0)
+            one_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+
+            with control_flow.Switch() as switch:
+                with switch.case(global_step == zero_var):
+                    tensor.assign(input=one_var, output=div_res)
+            decay_steps = decay_steps * div_res
+        else:
+            decay_steps_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=float(decay_steps))
+            global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)

-    decayed_lr = (learning_rate - end_learning_rate) * \
-        ((1 - global_step / decay_steps) ** power) + end_learning_rate
-    return decayed_lr
+        decayed_lr = (learning_rate - end_learning_rate) * \
+            ((1 - global_step / decay_steps) ** power) + end_learning_rate
+        return decayed_lr


 def piecewise_decay(boundaries, values):
@@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values):


     """
+    with default_main_program()._lr_schedule_guard():
+        if len(values) - len(boundaries) != 1:
+            raise ValueError("len(values) - len(boundaries) should be 1")

-    if len(values) - len(boundaries) != 1:
-        raise ValueError("len(values) - len(boundaries) should be 1")
-
-    global_step = _decay_step_counter()
+        global_step = _decay_step_counter()

-    lr = tensor.create_global_var(
-        shape=[1],
-        value=0.0,
-        dtype='float32',
-        persistable=True,
-        name="learning_rate")
+        lr = tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")

-    with control_flow.Switch() as switch:
-        for i in range(len(boundaries)):
-            boundary_val = tensor.fill_constant(
+        with control_flow.Switch() as switch:
+            for i in range(len(boundaries)):
+                boundary_val = tensor.fill_constant(
+                    shape=[1],
+                    dtype='float32',
+                    value=float(boundaries[i]),
+                    force_cpu=True)
+                value_var = tensor.fill_constant(
+                    shape=[1], dtype='float32', value=float(values[i]))
+                with switch.case(global_step < boundary_val):
+                    tensor.assign(value_var, lr)
+            last_value_var = tensor.fill_constant(
                 shape=[1],
                 dtype='float32',
-                value=float(boundaries[i]),
-                force_cpu=True)
-            value_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=float(values[i]))
-            with switch.case(global_step < boundary_val):
-                tensor.assign(value_var, lr)
-            last_value_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=float(values[len(values) - 1]))
-            with switch.default():
-                tensor.assign(last_value_var, lr)
+                value=float(values[len(values) - 1]))
+            with switch.default():
+                tensor.assign(last_value_var, lr)

     return lr

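
For callers nothing changes: the decay helpers are used exactly as before, and the guard they now open internally is what marks the generated ops as LRSched. A typical caller-side setup (illustrative network, mirroring the docstring examples above):

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
avg_cost = fluid.layers.mean(
    fluid.layers.square_error_cost(input=pred, label=y))

sgd_optimizer = fluid.optimizer.SGD(
    learning_rate=fluid.layers.exponential_decay(
        learning_rate=0.1, decay_steps=10000, decay_rate=0.5, staircase=True))
sgd_optimizer.minimize(avg_cost)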

python/paddle/fluid/tests/unittests/test_dist_mnist.py

Lines changed: 3 additions & 3 deletions
@@ -22,7 +22,7 @@ def _setup_config(self):
         self._sync_mode = True
         self._use_reduce = False

-    def test_se_resnext(self):
+    def test_dist_train(self):
         self.check_with_place("dist_mnist.py", delta=1e-7)


@@ -31,7 +31,7 @@ def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True

-    def test_se_resnext(self):
+    def test_dist_train(self):
         self.check_with_place("dist_mnist.py", delta=1e-7)


@@ -40,7 +40,7 @@ def _setup_config(self):
         self._sync_mode = False
         self._use_reduce = False

-    def test_se_resnext(self):
+    def test_dist_train(self):
         self.check_with_place("dist_mnist.py", delta=200)

