@@ -123,7 +123,7 @@ def _create_accumulators(self, block, parameters):
        """
        pass

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
        """Finish any custom updates needed
           before completing an optimization step

@@ -132,7 +132,7 @@ def _finish_update(self, block):
            parameters: list of parameter variables for the optimizer

        Returns:
-            list of finish ops or None
+            None
        """
        pass

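With the new signature, `_finish_update` receives the parameter list and appends any follow-up ops itself instead of returning them. A minimal sketch of a hypothetical subclass using the hook (the `WeightDecayOptimizer` name and its decay step are illustrative, not part of this patch):

# Hypothetical subclass, shown only to illustrate the new hook signature.
class WeightDecayOptimizer(Optimizer):
    _decay = 0.99

    def _finish_update(self, block, parameters):
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
        for param in parameters:
            # Append one follow-up op per parameter; nothing is returned.
            main_block.append_op(
                type="scale",
                inputs={"X": param},
                outputs={"Out": param},
                attrs={"scale": self._decay})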
@@ -236,7 +236,8 @@ def _create_optimization_pass(self,

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block)
+        self._finish_update(loss.block,
+                            [p[0] for p in parameters_and_grads])

        end = len(global_block.ops)
        return global_block.slice_ops(start, end)
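`parameters_and_grads` holds `(param, grad)` pairs, so the comprehension passed to `_finish_update` simply drops the gradients. A toy example with placeholder strings instead of framework variables:

# Toy illustration: real entries are (Variable, Variable) pairs, not strings.
parameters_and_grads = [("fc_0.w_0", "fc_0.w_0@GRAD"), ("fc_0.b_0", "fc_0.b_0@GRAD")]
params = [p[0] for p in parameters_and_grads]
print(params)  # ['fc_0.w_0', 'fc_0.b_0']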
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
@@ -507,32 +510,22 @@ def __init__(self,
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))
-
        # Create accumulator tensors for first and second moments
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
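The rewrite leans on the base class's `_add_accumulator`/`_get_accumulator` helpers so that each parameter gets its own `beta1_pow_acc`/`beta2_pow_acc` tensor. A rough sketch of that bookkeeping pattern, simplified to a plain dict keyed by accumulator name and parameter (the real helpers create persistable framework variables, so this is only an approximation):

# Simplified sketch of per-parameter accumulator bookkeeping; illustrative only.
class AccumulatorBookkeeping(object):
    def __init__(self):
        self._accumulators = {}  # {accumulator_name: {parameter: value}}

    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0, shape=None):
        self._accumulators.setdefault(name, {})[param] = fill_value

    def _get_accumulator(self, name, param):
        return self._accumulators[name][param]

acc = AccumulatorBookkeeping()
for p in ["fc_0.w_0", "fc_0.b_0"]:
    acc._add_accumulator("beta1_pow_acc", p, dtype='float32', fill_value=0.9, shape=[1])
print(acc._get_accumulator("beta1_pow_acc", "fc_0.w_0"))  # 0.9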
@@ -541,6 +534,11 @@ def _append_optimize_op(self, block, param_and_grad):
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
        # create the adam optimize op
        adam_op = block.append_op(
            type=self.type,
@@ -550,8 +548,8 @@ def _append_optimize_op(self, block, param_and_grad):
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment1": moment1,
                "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
@@ -566,24 +564,28 @@ def _append_optimize_op(self, block, param_and_grad):

        return adam_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
        """Update Beta1 and Beta2 Power accumulators
        """
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
+
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta2_pow_acc},
+                    outputs={"Out": beta2_pow_acc},
+                    attrs={"scale": self._beta2})


class AdamaxOptimizer(Optimizer):
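For context on why `_finish_update` multiplies the accumulators by beta1/beta2 once per optimization step: the Adam op needs beta1^t and beta2^t for bias correction, and the accumulators are initialized with `fill_value=beta`, so repeated scaling keeps them at the right power. A quick numeric check in plain Python (illustrative only):

# Illustrative check: repeated scaling keeps beta_pow equal to beta ** t.
beta1, beta2 = 0.9, 0.999
beta1_pow, beta2_pow = beta1, beta2          # accumulators start at beta ** 1
for t in range(1, 6):
    assert abs(beta1_pow - beta1 ** t) < 1e-12
    assert abs(beta2_pow - beta2 ** t) < 1e-12
    # bias-corrected step-size factor used by Adam at step t
    lr_factor = (1 - beta2_pow) ** 0.5 / (1 - beta1_pow)
    beta1_pow *= beta1                        # what the appended "scale" ops do
    beta2_pow *= beta2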
@@ -626,6 +628,7 @@ class AdamaxOptimizer(Optimizer):
    """
    _moment_acc_str = "moment"
    _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
@@ -645,28 +648,25 @@ def __init__(self,
        self._epsilon = epsilon

    def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
        # Create accumulator tensors for first moment and infinity norm
        for p in parameters:
            self._add_accumulator(self._moment_acc_str, p)
            self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
        # create the adamax optimize op
        adamax_op = block.append_op(
            type=self.type,
@@ -676,7 +676,7 @@ def _append_optimize_op(self, block, param_and_grad):
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment": moment,
                "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
@@ -691,18 +691,20 @@ def _append_optimize_op(self, block, param_and_grad):

        return adamax_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
        """Update Beta1 Power accumulator
        """
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})


class DecayedAdagradOptimizer(Optimizer):
@@ -1156,7 +1158,8 @@ def __init__(self,
                self.params_grads.append((param, grad))

        for param, grad in self.params_grads:
-            self._append_average_accumulate_op(param)
+            with param.block.program.optimized_guard(param):
+                self._append_average_accumulate_op(param)

        self.apply_program = Program()
        block = self.apply_program.global_block()
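None of this changes the user-facing API: a typical training setup still just constructs the optimizer and calls `minimize`, which now creates the beta power accumulators per parameter. A small end-to-end sketch, assuming the 2018-era `paddle.fluid` layer API shown here is available:

# Sketch of typical usage; layer names and shapes are arbitrary.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

optimizer = fluid.optimizer.Adam(learning_rate=0.001, beta1=0.9, beta2=0.999)
optimizer.minimize(loss)  # per-parameter beta1/beta2 power accumulators get created here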