Commit e42f9b7

Merge pull request #12103 from jacquesqiao/fix-optimizer-accumulator
Fix optimizer accumulator
2 parents 72ce4d5 + 2d2e813 commit e42f9b7

2 files changed: 67 additions & 64 deletions
python/paddle/fluid/optimizer.py

Lines changed: 65 additions & 62 deletions
@@ -123,7 +123,7 @@ def _create_accumulators(self, block, parameters):
         """
         pass
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Finish any custom updates needed
            before completing an optimization step
 
@@ -132,7 +132,7 @@ def _finish_update(self, block):
             parameters: list of parameter variables for the optimizer
 
         Returns:
-            list of finish ops or None
+            None
         """
         pass
 
@@ -236,7 +236,8 @@ def _create_optimization_pass(self,
 
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block)
+        self._finish_update(loss.block,
+                            [p[0] for p in parameters_and_grads])
 
         end = len(global_block.ops)
         return global_block.slice_ops(start, end)
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -507,32 +510,22 @@ def __init__(self,
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))
-
         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -541,6 +534,11 @@ def _append_optimize_op(self, block, param_and_grad):
                                         param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -550,8 +548,8 @@ def _append_optimize_op(self, block, param_and_grad):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -566,24 +564,28 @@ def _append_optimize_op(self, block, param_and_grad):
 
         return adam_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
+
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta2_pow_acc},
+                    outputs={"Out": beta2_pow_acc},
+                    attrs={"scale": self._beta2})
 
 
 class AdamaxOptimizer(Optimizer):
@@ -626,6 +628,7 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -645,28 +648,25 @@ def __init__(self,
         self._epsilon = epsilon
 
     def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
 
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -676,7 +676,7 @@ def _append_optimize_op(self, block, param_and_grad):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -691,18 +691,20 @@ def _append_optimize_op(self, block, param_and_grad):
 
         return adamax_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
 
 
 class DecayedAdagradOptimizer(Optimizer):
@@ -1156,7 +1158,8 @@ def __init__(self,
                 self.params_grads.append((param, grad))
 
         for param, grad in self.params_grads:
-            self._append_average_accumulate_op(param)
+            with param.block.program.optimized_guard(param):
+                self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
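
The recurring pattern above is a per-parameter accumulator registry: each (accumulator name, parameter) pair owns its own persistable state, created in _create_accumulators and looked up later by _append_optimize_op and _finish_update. Below is a minimal, framework-free sketch of that bookkeeping (the class and helper names are illustrative, not Paddle internals); it shows why the beta1/beta2 power terms can now advance independently for every parameter.

# Minimal sketch of the per-parameter accumulator pattern; illustrative only.
class AccumulatorRegistry:
    def __init__(self):
        # keyed by (accumulator_name, parameter_name)
        self._accumulators = {}

    def add_accumulator(self, name, param, fill_value=0.0):
        self._accumulators[(name, param)] = fill_value

    def get_accumulator(self, name, param):
        return self._accumulators[(name, param)]

    def set_accumulator(self, name, param, value):
        self._accumulators[(name, param)] = value


# Adam-style bookkeeping: every parameter carries its own moments and beta powers.
beta1, beta2 = 0.9, 0.999
registry = AccumulatorRegistry()
for param in ["fc.w", "fc.b"]:          # hypothetical parameter names
    registry.add_accumulator("moment1", param, 0.0)
    registry.add_accumulator("moment2", param, 0.0)
    registry.add_accumulator("beta1_pow_acc", param, beta1)
    registry.add_accumulator("beta2_pow_acc", param, beta2)

# The finish-update step scales each parameter's powers independently,
# which is what the per-parameter scale ops appended above accomplish.
for param in ["fc.w", "fc.b"]:
    registry.set_accumulator(
        "beta1_pow_acc", param,
        registry.get_accumulator("beta1_pow_acc", param) * beta1)
    registry.set_accumulator(
        "beta2_pow_acc", param,
        registry.get_accumulator("beta2_pow_acc", param) * beta2)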

python/paddle/fluid/tests/unittests/test_optimizer.py

Lines changed: 2 additions & 2 deletions
@@ -287,7 +287,7 @@ def test_adam_optimizer(self):
 
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 4)
         self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
         self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
         moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
@@ -354,7 +354,7 @@ def test_adamax_optimizer(self):
 
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 3)
         self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
         self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
         moment_acc = accumulators[adamax_optimizer.get_moment_str()]
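
The new expected counts follow from the registrations above: AdamOptimizer now tracks four accumulator types per parameter (moment1, moment2, beta1_pow_acc, beta2_pow_acc) and AdamaxOptimizer three (moment, inf_norm, beta1_pow_acc). A rough sketch of that arithmetic, assuming (as the test implies) that get_accumulators() is keyed by accumulator name with one entry per parameter underneath:

# Sketch only; parameter names are hypothetical.
adam_names = {"moment1", "moment2", "beta1_pow_acc", "beta2_pow_acc"}
adamax_names = {"moment", "inf_norm", "beta1_pow_acc"}
params = ["mul.x", "mul.y"]
assert len(adam_names) == 4             # matches the updated assertEqual above
assert len(adamax_names) == 3
total_adam_vars = len(adam_names) * len(params)   # 8 accumulator variables in all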
