
Commit 9048229

[Cherry pick] Remove unnecessary op when trainable is false (#19434)
* fix optimizer bug test=develop
1 parent 5b3d33b commit 9048229

File tree

3 files changed: +103 additions, -6 deletions

python/paddle/fluid/backward.py

Lines changed: 1 addition & 2 deletions

@@ -712,8 +712,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         parameters = parameter_list
     else:
         params = program.global_block().all_parameters()
-        program.global_block().iter_parameters()
-        parameters = [param.name for param in params]
+        parameters = [param.name for param in params if param.trainable]

     params_and_grads = []
     for param in parameters:
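
The effect of this change on append_backward can be seen with a small sketch. The snippet below is not part of the commit; it assumes the Paddle 1.x fluid API and mirrors the network used by the new unit test further down. A parameter created with trainable=False no longer appears in the returned (param, grad) list.

# Illustrative only: append_backward now skips frozen parameters.
import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # The fc weight is frozen via ParamAttr(trainable=False); the bias stays trainable.
    feature = fluid.layers.fc(input=x,
                              size=10,
                              param_attr=fluid.ParamAttr(trainable=False))
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=feature, label=label))
    params_grads = fluid.backward.append_backward(loss)
    # With this patch, only trainable parameters show up in the result.
    assert all(p.trainable for p, g in params_grads)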

python/paddle/fluid/optimizer.py

Lines changed: 19 additions & 4 deletions

@@ -360,8 +360,9 @@ def _create_optimization_pass(self, parameters_and_grads):
         global_block = framework.default_main_program().global_block()
         start = len(global_block.ops)
         self.helper = LayerHelper(self.__class__.__name__)
-        self._create_accumulators(global_block,
-                                  [p[0] for p in parameters_and_grads])
+        self._create_accumulators(
+            global_block,
+            [p[0] for p in parameters_and_grads if p[0].trainable])
         self._create_global_learning_rate()

         optimize_ops = []
@@ -587,6 +588,20 @@ def minimize(self,
             tuple: (optimize_ops, params_grads) which are, list of operators appended;
             and list of (param, grad) Variables pair for optimization.
         """
+        assert isinstance(loss, Variable), "The loss should be a Variable."
+        if no_grad_set is None:
+            no_grad_set = set()
+        elif isinstance(no_grad_set, set) or isinstance(
+                no_grad_set, list) or isinstance(no_grad_set, tuple):
+            no_grad_set = set(no_grad_set)
+        else:
+            assert "no_grad_set should be a set, but the passed type is {}".format(
+                type(no_grad_set))
+        parameters = loss.block.program.global_block().all_parameters()
+        param_no_trainable = set(
+            [param.name for param in parameters if param.trainable is False])
+        # If the parameter is not trainable, it should not have a gradient.
+        no_grad_set.update(param_no_trainable)
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
@@ -1390,7 +1405,7 @@ def _finish_update(self, block, param_and_grads):
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
         for param, grad in param_and_grads:
-            if grad is None:
+            if grad is None or param.trainable is False:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope("optimizer"):
@@ -1553,7 +1568,7 @@ def _finish_update(self, block, parameters_and_grads):
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
         for param, grad in parameters_and_grads:
-            if grad is None:
+            if grad is None or param.trainable is False:
                 continue
             with param.block.program._optimized_guard(
                 [param, grad]), name_scope('adamx'):
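
The core of the fix lives in minimize(): every parameter flagged trainable=False is folded into no_grad_set before backward() runs, so no gradient or update op is emitted for it. The helper below is a plain-Python mirror of that normalization, not Paddle code; the name build_no_grad_set and the Param tuple are made up for illustration, and the commit's bare assert in the else branch is tightened here into an explicit raise.

from collections import namedtuple

# Stand-in for a framework Parameter; only the fields used here.
Param = namedtuple('Param', ['name', 'trainable'])

def build_no_grad_set(parameters, no_grad_set=None):
    """Mirror of the no_grad_set handling that minimize() now performs."""
    if no_grad_set is None:
        no_grad_set = set()
    elif isinstance(no_grad_set, (set, list, tuple)):
        no_grad_set = set(no_grad_set)
    else:
        raise TypeError("no_grad_set should be a set, but the passed type is {}"
                        .format(type(no_grad_set)))
    # A parameter that is not trainable should not receive a gradient.
    no_grad_set.update(p.name for p in parameters if p.trainable is False)
    return no_grad_set

params = [Param('fc_0.w_0', False), Param('fc_0.b_0', True)]
assert build_no_grad_set(params) == {'fc_0.w_0'}
assert build_no_grad_set(params, ['img']) == {'img', 'fc_0.w_0'}
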
Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

from collections import Counter
import unittest
import paddle.fluid as fluid
from simple_nets import init_data


def test_trainable():
    x = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    feature = fluid.layers.fc(input=x,
                              size=10,
                              param_attr=fluid.ParamAttr(trainable=False))
    loss = fluid.layers.cross_entropy(input=feature, label=label)
    loss = fluid.layers.mean(loss)
    return loss


class TestTrainable(unittest.TestCase):
    def check_trainable(self,
                        model,
                        feed_dict,
                        op_count,
                        optimizer=fluid.optimizer.Adam()):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)

        main = fluid.Program()
        startup = fluid.Program()

        with fluid.program_guard(main, startup):
            loss = model()
            optimizer.minimize(loss)

            # The number of optimizer ops should match op_count.
            ops = Counter([op.type for op in main.global_block().ops])
            for op in op_count:
                if op_count[op] == 0:
                    assert op not in ops
                else:
                    assert ops[op] == op_count[op]

            exe.run(fluid.default_startup_program())
            exe.run(feed=feed_dict)

    def test_trainable(self):
        batch_size = 2
        img, label = init_data(batch_size, img_shape=[784], label_range=9)
        feed_dict = {'image': img, 'label': label}
        # Because the weight of the FC layer is not trainable and x is
        # stop_gradient, 'mul_grad' should not be appended.
        self.check_trainable(
            test_trainable,
            feed_dict,
            op_count={'adam': 1,
                      'scale': 2,
                      'mul_grad': 0})
        self.check_trainable(
            test_trainable,
            feed_dict,
            op_count={'adamax': 1,
                      'scale': 1,
                      'mul_grad': 0},
            optimizer=fluid.optimizer.Adamax(learning_rate=0.2))


if __name__ == '__main__':
    unittest.main()
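
The op-count check used by check_trainable above is a handy pattern on its own when verifying which ops an optimizer actually appended. A standalone sketch follows; op_types is a made-up stand-in for [op.type for op in main.global_block().ops].

from collections import Counter

# Hypothetical op list; in practice collect it from the built program.
op_types = ['mul', 'elementwise_add', 'mean', 'fill_constant',
            'mean_grad', 'elementwise_add_grad', 'adam']
ops = Counter(op_types)

expected = {'adam': 1, 'mul_grad': 0}
for op, count in expected.items():
    if count == 0:
        # The op must not have been appended at all.
        assert op not in ops
    else:
        assert ops[op] == count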
