
Commit 9c63b7c

[cherry-pick] add bn momentum variable (#21435)
* batch_norm momentum support variable. test=develop
1 parent 5c7c6b1 commit 9c63b7c

File tree

5 files changed: +139 -46 lines changed

paddle/fluid/operators/batch_norm_op.cc

Lines changed: 20 additions & 1 deletion
@@ -58,6 +58,13 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
   const DataLayout data_layout = framework::StringToDataLayout(
       ctx->Attrs().Get<std::string>("data_layout"));
 
+  if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) {
+    auto mom = ctx->Inputs("MomentumTensor");
+    PADDLE_ENFORCE_EQ(mom.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Input(MomentumTensor) size must be 1"));
+  }
+
   PADDLE_ENFORCE_GE(
       x_dims.size(), 2,
       "ShapeError: the dimension of input X must greater than or equal to 2."

@@ -173,6 +180,11 @@ void BatchNormOpMaker::Make() {
   AddInput("Variance",
            "The global variance (for training) "
            "or estimated Variance (for testing)");
+  AddInput("MomentumTensor",
+           "(Tensor<float32>, optional) If provided, batch_norm will "
+           "use this as momentum, this has a higher priority than "
+           "attr(momentum), the shape of this tensor MUST BE [1].")
+      .AsDispensable();
   AddOutput("Y", "result after normalization");
   AddOutput("MeanOut",
             "Share memory with Mean. "

@@ -221,7 +233,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
-    const float momentum = ctx.Attr<float>("momentum");
+    float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
 
@@ -306,6 +318,13 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
         PADDLE_THROW("Unknown storage order: %s", data_layout_str);
     }
 
+    // if MomentumTensor is set, use MomentumTensor value, momentum
+    // is only used in this training branch
+    if (ctx.HasInput("MomentumTensor")) {
+      const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
+      momentum = mom_tensor->data<float>()[0];
+    }
+
     running_mean_arr =
         running_mean_arr * momentum + saved_mean_e * (1. - momentum);
     running_var_arr =
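Taken together, the InferShape check and the kernel read above mean the momentum input must be a single float32 value; any size other than 1 trips the PADDLE_ENFORCE_EQ. A minimal sketch of exercising this from Python, assuming the fluid 1.6-era API (the variable names below are illustrative, not part of the commit):

import numpy as np
import paddle.fluid as fluid

x = fluid.data(name='x', shape=[4, 16], dtype='float32')
# the momentum input must have shape [1]; anything larger fails the check
mom = fluid.data(name='mom', shape=[1], dtype='float32')
y = fluid.layers.batch_norm(input=x, momentum=mom)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'x': np.random.rand(4, 16).astype('float32'),
                     'mom': np.array([0.9], dtype='float32')},
               fetch_list=[y])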

paddle/fluid/operators/batch_norm_op.cu

Lines changed: 10 additions & 1 deletion
@@ -43,7 +43,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const float momentum = ctx.Attr<float>("momentum");
+    float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");

@@ -133,6 +133,15 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           est_mean->template data<BatchNormParamType<T>>(),
           est_var->template data<BatchNormParamType<T>>(), epsilon));
     } else {
+      // if MomentumTensor is set, use MomentumTensor value, momentum
+      // is only used in this training branch
+      if (ctx.HasInput("MomentumTensor")) {
+        const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
+        Tensor mom_cpu;
+        TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
+        momentum = mom_cpu.data<float>()[0];
+      }
+
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
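The CUDA path differs from the CPU path only in how the scalar is read: the momentum tensor may live in device memory, so the kernel first copies it to the host with TensorCopySync. The Python-side usage is unchanged; a hedged GPU variant of the sketch above, assuming a CUDA build (names illustrative):

import numpy as np
import paddle.fluid as fluid

x = fluid.data(name='x', shape=[4, 16], dtype='float32')
# a shape-[1] tensor built inside the program; the kernel copies it
# device-to-host before reading the scalar
mom = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.9)
y = fluid.layers.batch_norm(input=x, momentum=mom)

exe = fluid.Executor(fluid.CUDAPlace(0))  # requires a CUDA build
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'x': np.random.rand(4, 16).astype('float32')},
               fetch_list=[y])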

python/paddle/fluid/layers/nn.py

Lines changed: 52 additions & 19 deletions
@@ -4176,13 +4176,14 @@ def batch_norm(input,
         sync_batch_norm automatically.
 
     Args:
-        input(variable): The rank of input variable can be 2, 3, 4, 5. The data type
+        input(Variable): The rank of input variable can be 2, 3, 4, 5. The data type
             is float16 or float32 or float64.
         act(string, Default None): Activation type, linear|relu|prelu|...
         is_test (bool, Default False): A flag indicating whether it is in
             test phrase or not.
-        momentum(float, Default 0.9): The value used for the moving_mean and
-            moving_var computation. The updated formula is:
+        momentum(float|Variable, Default 0.9): The value used for the moving_mean and
+            moving_var computation. This should be a float number or a Variable with
+            shape [1] and data type as float32. The updated formula is:
             :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
             :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
             Default is 0.9.

@@ -4228,6 +4229,33 @@ def batch_norm(input,
             x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
             hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
             hidden2 = fluid.layers.batch_norm(input=hidden1)
+
+        .. code-block:: python
+
+            # batch_norm with momentum as Variable
+            import paddle.fluid as fluid
+            import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
+
+            def get_decay_momentum(momentum_init, decay_steps, decay_rate):
+                global_step = lr_scheduler._decay_step_counter()
+                momentum = fluid.layers.create_global_var(
+                    shape=[1],
+                    value=float(momentum_init),
+                    dtype='float32',
+                    # set persistable for save checkpoints and resume
+                    persistable=True,
+                    name="momentum")
+                div_res = global_step / decay_steps
+                decayed_momentum = momentum_init * (decay_rate**div_res)
+                fluid.layers.assign(decayed_momentum, momentum)
+
+                return momentum
+
+            x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32')
+            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+            momentum = get_decay_momentum(0.9, 1e5, 0.9)
+            hidden2 = fluid.layers.batch_norm(input=hidden1, momentum=momentum)
+
     """
     assert bias_attr is not False, "bias_attr should not be False in batch_norm."
     helper = LayerHelper('batch_norm', **locals())

@@ -4303,31 +4331,36 @@ def batch_norm(input,
     batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
         dtype)
 
+    inputs = {
+        "X": input,
+        "Scale": scale,
+        "Bias": bias,
+        "Mean": mean,
+        "Variance": variance
+    }
+    attrs = {
+        "epsilon": epsilon,
+        "is_test": is_test,
+        "data_layout": data_layout,
+        "use_mkldnn": False,
+        "fuse_with_relu": False,
+        "use_global_stats": use_global_stats
+    }
+    if isinstance(momentum, Variable):
+        inputs['MomentumTensor'] = momentum
+    else:
+        attrs['momentum'] = momentum
     helper.append_op(
         type="batch_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-            "Mean": mean,
-            "Variance": variance
-        },
+        inputs=inputs,
         outputs={
             "Y": batch_norm_out,
             "MeanOut": mean_out,
             "VarianceOut": variance_out,
             "SavedMean": saved_mean,
             "SavedVariance": saved_variance
         },
-        attrs={
-            "momentum": momentum,
-            "epsilon": epsilon,
-            "is_test": is_test,
-            "data_layout": data_layout,
-            "use_mkldnn": False,
-            "fuse_with_relu": False,
-            "use_global_stats": use_global_stats
-        })
+        attrs=attrs)
 
     return helper.append_activation(batch_norm_out)
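The dispatch added above is the whole user-facing contract: a plain float still becomes attr(momentum), while a Variable is wired in as the MomentumTensor input and read by the kernels at runtime. A minimal side-by-side sketch, assuming the fluid 1.6-era API (illustrative only):

import paddle.fluid as fluid

x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32')

# float: stored as attr(momentum), exactly as before this commit
y_attr = fluid.layers.batch_norm(input=x, momentum=0.9)

# Variable of shape [1]: routed through the new MomentumTensor input
mom = fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.9)
y_tensor = fluid.layers.batch_norm(input=x, momentum=mom)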

python/paddle/fluid/tests/unittests/test_batch_norm_op.py

Lines changed: 44 additions & 25 deletions
@@ -310,6 +310,7 @@ def setUp(self):
         self.fuse_with_relu = False
         self.data_formats = ["NCHW", "NHWC"]
         self.momentum = 0.9
+        self.use_momentum_variable = False
         self.epsilon = 0.00001
         self.init_kernel_type()
         self.init_test_case()

@@ -367,6 +368,7 @@ def test_with_place(place, data_layout, shape):
             bias = np.random.random_sample(scale_shape).astype(np.float32)
             mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
             y_grad = np.random.random_sample(shape).astype(np.float32)
+            momentum_var = np.array([momentum]).astype(np.float32)
 
             y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
                 x, y_grad, scale, bias, mean, variance, epsilon, momentum,

@@ -380,7 +382,7 @@ def test_with_place(place, data_layout, shape):
 
             var_names = [
                 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-                'saved_variance'
+                'saved_variance', 'momentum_var'
             ]
             ground_truth = {name: var_dict[name] for name in var_names}

@@ -392,31 +394,36 @@ def test_with_place(place, data_layout, shape):
                         name=name,
                         dtype='float32',
                         shape=ground_truth[name].shape)
+                inputs = {
+                    "X": block.var('x'),
+                    "Scale": block.var('scale'),
+                    "Bias": block.var('bias'),
+                    "Mean": block.var('mean'),
+                    "Variance": block.var('variance')
+                }
+                attrs = {
+                    "epsilon": epsilon,
+                    "is_test": False,
+                    "data_layout": data_layout,
+                    "use_mkldnn": self.use_mkldnn,
+                    "fuse_with_relu": self.fuse_with_relu,
+                    "use_global_stats": self.use_global_stats
+                }
+                if self.use_momentum_variable:
+                    inputs['MomentumTensor'] = block.var('momentum_var')
+                else:
+                    attrs['momentum'] = momentum
                 bn_op = block.append_op(
                     type="batch_norm",
-                    inputs={
-                        "X": block.var('x'),
-                        "Scale": block.var('scale'),
-                        "Bias": block.var('bias'),
-                        "Mean": block.var('mean'),
-                        "Variance": block.var('variance')
-                    },
+                    inputs=inputs,
                     outputs={
                         "Y": block.var('y'),
                         "MeanOut": block.var('mean'),  # share memory
                         "VarianceOut": block.var('variance'),  # share memory
                         "SavedMean": block.var('saved_mean'),
                         "SavedVariance": block.var('saved_variance')
                     },
-                    attrs={
-                        "momentum": momentum,
-                        "epsilon": epsilon,
-                        "is_test": False,
-                        "data_layout": data_layout,
-                        "use_mkldnn": self.use_mkldnn,
-                        "fuse_with_relu": self.fuse_with_relu,
-                        "use_global_stats": self.use_global_stats
-                    })
+                    attrs=attrs)
                 block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
                 # generate backward op_desc

@@ -434,14 +441,15 @@ def test_with_place(place, data_layout, shape):
                     grad_var.set_dtype(core.VarDesc.VarType.FP32)
 
                 exe = fluid.Executor(place)
-                out = exe.run(
-                    program,
-                    feed={
-                        name: var_dict[name]
-                        for name in
-                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
-                    },
-                    fetch_list=self.fetch_list)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in [
+                                      'x', 'scale', 'bias', 'mean', 'variance',
+                                      'y@GRAD', 'momentum_var'
+                                  ]
+                              },
+                              fetch_list=self.fetch_list)
 
                 for id, name in enumerate(self.fetch_list):
                     if name == 'variance':

@@ -471,6 +479,17 @@ def init_test_case(self):
         self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
 
 
+class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining):
+    def init_test_case(self):
+        self.use_momentum_variable = True
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD',
+            'scale@GRAD', 'bias@GRAD'
+        ]
+
+
 class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
     def init_test_case(self):
         self.use_global_stats = True
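For reference, the ground truth the test compares against uses the same moving-average update whether momentum came from the attribute or the tensor; only its source differs. A NumPy sketch of that update (illustrative, mirroring the formula in the docstring above):

import numpy as np

def update_running_stats(running_mean, running_var, batch_mean, batch_var,
                         momentum):
    # identical rule for attr(momentum) and a shape-[1] MomentumTensor
    new_mean = running_mean * momentum + batch_mean * (1.0 - momentum)
    new_var = running_var * momentum + batch_var * (1.0 - momentum)
    return new_mean, new_var

# scalar read out of a shape-[1] float32 tensor, as the kernels do
momentum = np.array([0.9], dtype=np.float32)[0]
m, v = update_running_stats(np.zeros(3), np.ones(3),
                            np.full(3, 0.5), np.full(3, 2.0), momentum)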

python/paddle/fluid/tests/unittests/test_layers.py

Lines changed: 13 additions & 0 deletions
@@ -2465,6 +2465,19 @@ def make_batch_norm(self):
             out = layers.batch_norm(data)
             return (out)
 
+    def make_batch_norm_momentum_variable(self):
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            data = self._get_data(
+                name='data', shape=[32, 128, 128], dtype="float32")
+            momentum = self._get_data(
+                name='momentum',
+                shape=[1],
+                dtype='float32',
+                append_batch_size=False)
+            out = layers.batch_norm(data, momentum=momentum)
+            return (out)
+
     def make_range(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):