
Commit 0231f58

Fix double_grad bug in static-graph (#24190) (#24286)

Rename internal gradient variables in multiple backward so that they have
different names than in the previous backward. For example, with
y = x * x and grad = fluid.gradients(fluid.gradients(y, x) + y * y, x),
the gradient variable names created in the second-time backward for the
partial forward network (y * y) may have the same names as those from the
first-time fluid.gradients(y, x). test=develop
1 parent 55fc501 commit 0231f58
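In short: when fluid.gradients is called a second time on the same program, the gradient variables it creates for the re-differentiated part of the forward network can reuse names from the first call, which is why the fix renames them before _addup_repetitive_outputs_ runs. A minimal sketch of the colliding scenario from the commit message, using only the fluid 1.x static-graph API that appears in this commit (variable names are illustrative, not from the patch):

import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.create_parameter(
        name='x', shape=[1], dtype='float32',
        default_initializer=fluid.initializer.Constant(1))
    y = x * x
    dy_dx, = fluid.gradients(y, x)    # first backward pass over y = x * x
    z = dy_dx * dy_dx + y * y         # reuses part of the forward network (y * y)
    dz_dx, = fluid.gradients(z, x)    # second backward pass; before this fix, its
                                      # internal gradient names could collide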

File tree

2 files changed: 96 additions, 8 deletions


python/paddle/fluid/backward.py

Lines changed: 42 additions & 0 deletions
@@ -827,6 +827,19 @@ def _get_sub_block_path(sub_block, sub_block_op_desc, no_grad_set):
     return sub_block.ops
 
 
+def _is_grad_op_(op):
+    op_maker = core.op_proto_and_checker_maker
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
+    if op_maker.kOpRoleVarAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
+        return True
+    return False
+
+
+def _rename_grad_name_(name, grad_order):
+    return 'grad/' * grad_order + name
+
+
 def _append_backward_ops_(block,
                           ops,
                           target_block,
@@ -862,6 +875,8 @@ def _append_backward_ops_(block,
     grad_op_descs = []
     program = block.program
 
+    rename_var_map = {}
+
     # add grad_op_desc by reversed ops
     for op in reversed(ops):
         grad_sub_block_list = []
@@ -894,6 +909,33 @@ def _append_backward_ops_(block,
             for op_desc in grad_op_desc:
                 op_desc._set_attr(device_attr_name, op_device)
 
+        # Rename internal gradient variables in multiple backward
+        # so that they have different names with previous backward.
+        # For example:
+        #  y = x * x, grad = fluid.gradients(fluid.gradients(y, x) + y * y, x)
+        # In second-time backward, gradient variable names of partial
+        # forward network (y * y) may be have same names with first-time
+        # fluid.gradients(y, x).
+        # So rename here before _addup_repetitive_outputs_.
+        if program._appending_grad_times > 1:
+            for op_desc in grad_op_desc:
+                if not _is_grad_op_(op):
+                    for name in op_desc.input_arg_names():
+                        if name in rename_var_map:
+                            op_desc._rename_input(name, rename_var_map[name])
+                for name in op_desc.output_arg_names():
+                    if "@GRAD" not in name:
+                        continue
+                    if block.desc.find_var(name.encode("ascii")):
+                        new_name = _rename_grad_name_(
+                            name, program._appending_grad_times)
+                        op_desc._rename_output(name, new_name)
+                        rename_var_map[name] = new_name
+
+                        if name in op_grad_to_var:
+                            op_grad_to_var[new_name] = op_grad_to_var[name]
+                            op_grad_to_var.pop(name)
+
         # If input_grad_names_set is not None, extend grad_op_descs only when
         # any input grad in outputs of previous grad ops.
         # But this strategy is not suited for while op for some control flow,
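The new _rename_grad_name_ helper encodes the backward order directly in the variable name: it prefixes the name with 'grad/' once per accumulated backward pass (program._appending_grad_times). A standalone illustration of the names it produces, with the helper copied from the diff above and a hypothetical variable name:

def _rename_grad_name_(name, grad_order):
    return 'grad/' * grad_order + name

# In a second call to fluid.gradients (grad_order == 2), a colliding gradient
# variable such as "y@GRAD" is rewritten to "grad/grad/y@GRAD".
print(_rename_grad_name_("y@GRAD", 2))   # grad/grad/y@GRAD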

python/paddle/fluid/tests/unittests/test_calc_gradient.py

Lines changed: 54 additions & 8 deletions
@@ -23,16 +23,62 @@
 
 
 class TestCalcGradient(unittest.TestCase):
     def test_calc_gradient(self):
-        x = layers.create_parameter(dtype="float32", shape=[5, 10])
-        y = layers.create_parameter(dtype="float32", shape=[10, 8])
-        mul_out = layers.mul(x=x, y=y)
-        mean_out = layers.mean(mul_out)
-        a = calc_gradient(mean_out, mul_out)
-        b = calc_gradient(mean_out, x)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = layers.create_parameter(dtype="float32", shape=[5, 10])
+            y = layers.create_parameter(dtype="float32", shape=[10, 8])
+            mul_out = layers.mul(x=x, y=y)
+            mean_out = layers.mean(mul_out)
+            a = calc_gradient(mean_out, mul_out)
+            b = calc_gradient(mean_out, x)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+        exe.run(startup)
+        exe.run(main, feed={}, fetch_list=[a, b])
+
+
+class TestDoubleGrad(unittest.TestCase):
+    def test1(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            net = lambda x: x * x
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(3))
+            grad1, = fluid.gradients(net(x), x)  # 2x = 6
+            z = net(x - grad1)
+            grad2, = fluid.gradients(z, x)  # gradients( (x - 2x)^2) = 2x = 6
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out = exe.run(main, fetch_list=[grad1.name, grad2.name])
+        self.assertEqual(6, out[0][0])
+        self.assertEqual(6, out[1][0])
+
+    def test2(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(1))
+            y = x * x
+            dx1, = fluid.gradients(y, x)
+            z = dx1 * dx1 + y * y
+            dx2, = fluid.gradients(z, x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out, = exe.run(main, fetch_list=[dx2])
+        self.assertEqual(12, out[0])
 
 
 if __name__ == "__main__":
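The values asserted in the new tests follow from elementary calculus; a plain-Python check of the arithmetic behind the comments in test1 and test2 (no Paddle needed):

# test1: x = 3, grad1 = d/dx (x*x) = 2x = 6,
# z = (x - grad1)^2 = (x - 2x)^2 = x^2, so grad2 = dz/dx = 2x = 6.
x = 3.0
grad1 = 2 * x
grad2 = 2 * x
assert grad1 == 6 and grad2 == 6

# test2: x = 1, y = x^2, dx1 = 2x, z = dx1^2 + y^2 = 4x^2 + x^4,
# so dx2 = dz/dx = 8x + 4x^3 = 12 at x = 1.
x = 1.0
dx2 = 8 * x + 4 * x ** 3
assert dx2 == 12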
