
Commit 0231f58

Fix double_grad bug in static-graph (#24190) (#24286)

Rename internal gradient variables in multiple backward so that they have
different names than in the previous backward. For example, with
y = x * x and grad = fluid.gradients(fluid.gradients(y, x) + y * y, x),
the gradient variable names created in the second-time backward for the
partial forward network (y * y) may have the same names as those from the
first-time fluid.gradients(y, x). test=develop
1 parent 55fc501 commit 0231f58
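In short: when fluid.gradients is called a second time on the same program, the gradient variables it creates for the re-differentiated part of the forward network can reuse names from the first call, which is why the fix renames them before _addup_repetitive_outputs_ runs. A minimal sketch of the colliding scenario from the commit message, using only the fluid 1.x static-graph API that appears in this commit (variable names are illustrative, not from the patch):

import paddle.fluid as fluid

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.create_parameter(
        name='x', shape=[1], dtype='float32',
        default_initializer=fluid.initializer.Constant(1))
    y = x * x
    dy_dx, = fluid.gradients(y, x)    # first backward pass over y = x * x
    z = dy_dx * dy_dx + y * y         # reuses part of the forward network (y * y)
    dz_dx, = fluid.gradients(z, x)    # second backward pass; before this fix, its
                                      # internal gradient names could collide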

File tree

2 files changed: 96 additions, 8 deletions


python/paddle/fluid/backward.py

Lines changed: 42 additions & 0 deletions
@@ -827,6 +827,19 @@ def _get_sub_block_path(sub_block, sub_block_op_desc, no_grad_set):
     return sub_block.ops
 
 
+def _is_grad_op_(op):
+    op_maker = core.op_proto_and_checker_maker
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
+    if op_maker.kOpRoleVarAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
+        return True
+    return False
+
+
+def _rename_grad_name_(name, grad_order):
+    return 'grad/' * grad_order + name
+
+
 def _append_backward_ops_(block,
                           ops,
                           target_block,
@@ -862,6 +875,8 @@ def _append_backward_ops_(block,
     grad_op_descs = []
     program = block.program
 
+    rename_var_map = {}
+
     # add grad_op_desc by reversed ops
     for op in reversed(ops):
         grad_sub_block_list = []
@@ -894,6 +909,33 @@ def _append_backward_ops_(block,
             for op_desc in grad_op_desc:
                 op_desc._set_attr(device_attr_name, op_device)
 
+        # Rename internal gradient variables in multiple backward
+        # so that they have different names with previous backward.
+        # For example:
+        #  y = x * x, grad = fluid.gradients(fluid.gradients(y, x) + y * y, x)
+        # In second-time backward, gradient variable names of partial
+        # forward network (y * y) may be have same names with first-time
+        # fluid.gradients(y, x).
+        # So rename here before _addup_repetitive_outputs_.
+        if program._appending_grad_times > 1:
+            for op_desc in grad_op_desc:
+                if not _is_grad_op_(op):
+                    for name in op_desc.input_arg_names():
+                        if name in rename_var_map:
+                            op_desc._rename_input(name, rename_var_map[name])
+                for name in op_desc.output_arg_names():
+                    if "@GRAD" not in name:
+                        continue
+                    if block.desc.find_var(name.encode("ascii")):
+                        new_name = _rename_grad_name_(
+                            name, program._appending_grad_times)
+                        op_desc._rename_output(name, new_name)
+                        rename_var_map[name] = new_name
+
+                        if name in op_grad_to_var:
+                            op_grad_to_var[new_name] = op_grad_to_var[name]
+                            op_grad_to_var.pop(name)
+
         # If input_grad_names_set is not None, extend grad_op_descs only when
         # any input grad in outputs of previous grad ops.
         # But this strategy is not suited for while op for some control flow,
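The new _rename_grad_name_ helper encodes the backward order directly in the variable name: it prefixes the name with 'grad/' once per accumulated backward pass (program._appending_grad_times). A standalone illustration of the names it produces, with the helper copied from the diff above and a hypothetical variable name:

def _rename_grad_name_(name, grad_order):
    return 'grad/' * grad_order + name

# In a second call to fluid.gradients (grad_order == 2), a colliding gradient
# variable such as "y@GRAD" is rewritten to "grad/grad/y@GRAD".
print(_rename_grad_name_("y@GRAD", 2))   # grad/grad/y@GRAD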

python/paddle/fluid/tests/unittests/test_calc_gradient.py

Lines changed: 54 additions & 8 deletions
@@ -23,16 +23,62 @@
 
 
 class TestCalcGradient(unittest.TestCase):
     def test_calc_gradient(self):
-        x = layers.create_parameter(dtype="float32", shape=[5, 10])
-        y = layers.create_parameter(dtype="float32", shape=[10, 8])
-        mul_out = layers.mul(x=x, y=y)
-        mean_out = layers.mean(mul_out)
-        a = calc_gradient(mean_out, mul_out)
-        b = calc_gradient(mean_out, x)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = layers.create_parameter(dtype="float32", shape=[5, 10])
+            y = layers.create_parameter(dtype="float32", shape=[10, 8])
+            mul_out = layers.mul(x=x, y=y)
+            mean_out = layers.mean(mul_out)
+            a = calc_gradient(mean_out, mul_out)
+            b = calc_gradient(mean_out, x)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+        exe.run(startup)
+        exe.run(main, feed={}, fetch_list=[a, b])
+
+
+class TestDoubleGrad(unittest.TestCase):
+    def test1(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            net = lambda x: x * x
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(3))
+            grad1, = fluid.gradients(net(x), x)  # 2x = 6
+            z = net(x - grad1)
+            grad2, = fluid.gradients(z, x)  # gradients( (x - 2x)^2) = 2x = 6
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out = exe.run(main, fetch_list=[grad1.name, grad2.name])
+        self.assertEqual(6, out[0][0])
+        self.assertEqual(6, out[1][0])
+
+    def test2(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.create_parameter(
+                name='x',
+                shape=[1],
+                dtype='float32',
+                default_initializer=fluid.initializer.Constant(1))
+            y = x * x
+            dx1, = fluid.gradients(y, x)
+            z = dx1 * dx1 + y * y
+            dx2, = fluid.gradients(z, x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        out, = exe.run(main, fetch_list=[dx2])
+        self.assertEqual(12, out[0])
 
 
 if __name__ == "__main__":
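The values asserted in the new tests follow from elementary calculus; a plain-Python check of the arithmetic behind the comments in test1 and test2 (no Paddle needed):

# test1: x = 3, grad1 = d/dx (x*x) = 2x = 6,
# z = (x - grad1)^2 = (x - 2x)^2 = x^2, so grad2 = dz/dx = 2x = 6.
x = 3.0
grad1 = 2 * x
grad2 = 2 * x
assert grad1 == 6 and grad2 == 6

# test2: x = 1, y = x^2, dx1 = 2x, z = dx1^2 + y^2 = 4x^2 + x^4,
# so dx2 = dz/dx = 8x + 4x^3 = 12 at x = 1.
x = 1.0
dx2 = 8 * x + 4 * x ** 3
assert dx2 == 12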
