@@ -1,7 +1,9 @@
 import paddle.v2.framework.framework as framework
 from collections import defaultdict
 
-__all__ = ['SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer']
+__all__ = [
+    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer'
+]
 
 
 class Optimizer(object):
@@ -43,6 +45,19 @@ def _create_accumulators(self, block, parameters):
         """
         pass
 
+    def _finish_update(self, block):
+        """Finish any custom updates needed
+           before completing an optimization step
+
+        Args:
+            block: the block in which the loss variable is present
+            parameters: list of parameter variables for the optimizer
+
+        Returns:
+            list of finish ops or None
+        """
+        pass
+
     def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
         """Utility function to add an accumulator for a parameter
 
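The `_finish_update` hook is a no-op on the base class; a subclass overrides it when it needs ops that must run once per optimization step, after all of the per-parameter update ops. A minimal sketch of such an override (the subclass name and the `_decay_acc` accumulator are hypothetical and only illustrate the contract; `AdamOptimizer` further down in this diff is the real in-tree example):

class DecayingAccumulatorOptimizer(Optimizer):
    def _finish_update(self, block):
        # `self._decay_acc` is assumed to have been created by this
        # subclass in _create_accumulators. The "scale" op is the same
        # one AdamOptimizer._finish_update uses below.
        decay_op = block.program.global_block().append_op(
            type="scale",
            inputs={"X": self._decay_acc},
            outputs={"Out": self._decay_acc},
            attrs={"scale": 0.99})
        # create_optimization_pass appends whatever list is returned here
        # to the ops it hands back; returning None skips this step.
        return [decay_op]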
@@ -137,15 +152,17 @@ def create_optimization_pass(self, parameters_and_grads, loss):
           parameters_and_grads: a list of (variable, gradient) pair to update.
 
         Returns:
-          optmization_op_list: a list of optimization operator that will update
-          parameter using gradient.
+          return_op_list: a list of operators that will complete one step of
+          optimization. This will include parameter update ops, global step
+          update ops and any other custom ops required by subclasses to manage
+          their internal state.
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
         # the subclass will implement the _append_optimize_op method and the
         # _initialize_tensors method. The subclass can extend the
         # _create_accumulators method if it needs to create accumulators
-        # for parameters.
+        # for parameters and extend _finish_update method to add custom ops.
 
         # Create any accumulators
         self._create_accumulators(loss.block,
@@ -160,7 +177,17 @@ def create_optimization_pass(self, parameters_and_grads, loss):
                                                   param_and_grad)
             optimize_ops.append(optimize_op)
 
-        return optimize_ops
+        # Returned list of ops can include more ops in addition
+        # to optimization ops
+        return_ops = optimize_ops
+
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        finish_ops = self._finish_update(loss.block)
+        if finish_ops is not None:
+            return_ops += finish_ops
+
+        return return_ops
 
     def minimize(self, loss, parameter_list=None, no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
@@ -329,3 +356,124 @@ def _append_optimize_op(self, block, param_and_grad):
             attrs={"epsilon": self._epsilon})
 
         return adagrad_op
+
+
+class AdamOptimizer(Optimizer):
+    """Implements the Adam Optimizer
+    """
+    _moment1_acc_str = "moment1"
+    _moment2_acc_str = "moment2"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(AdamOptimizer, self).__init__()
+        self.type = "adam"
+        self._learning_rate = learning_rate
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a variable for learning_rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # create an op to init the learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        global_block = block.program.global_block()
+        # Create beta1 and beta2 power tensors
+        beta_shape = [1]
+        # Create variables for beta1 and beta2 powers
+        self._beta1_pow_acc = global_block.create_var(
+            dtype="float32", shape=beta_shape, lod_level=0)
+        self._beta2_pow_acc = global_block.create_var(
+            dtype="float32", shape=beta_shape, lod_level=0)
+
+        # Initialize beta1 and beta2 power accumulators
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"shape": beta_shape,
+                   "value": self._beta1})
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"shape": beta_shape,
+                   "value": self._beta2})
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(block, self._moment1_acc_str, p, 'float32')
+            self._add_accumulator(block, self._moment2_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        # create the adam optimize op
+        adam_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._lr,
+                "Moment1": moment1,
+                "Moment2": moment2,
+                "Beta1Pow": self._beta1_pow_acc,
+                "Beta2Pow": self._beta2_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "Moment1Out": moment1,
+                "Moment2Out": moment2
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            })
+
+        return adam_op
+
+    def _finish_update(self, block):
+        """Update Beta1 and Beta2 Power accumulators
+        """
+        assert isinstance(block, framework.Block)
+        global_block = block.program.global_block()
+        scale_beta1 = global_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        scale_beta2 = global_block.append_op(
+            type="scale",
+            inputs={"X": self._beta2_pow_acc},
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"scale": self._beta2})
+
+        return [scale_beta1, scale_beta2]
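For reference, the update that the `adam` op and the two `scale` ops above jointly perform is the standard Adam rule of Kingma and Ba: the power accumulators hold beta1^t and beta2^t for the bias correction and are advanced exactly once per step, which is why they live in the global block rather than per parameter. A NumPy sketch of one step under that reading (the intended math, not the actual kernel implementation):

import numpy as np

def adam_step(param, grad, moment1, moment2, beta1_pow, beta2_pow,
              lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Moment1Out / Moment2Out: exponential moving averages of the
    # gradient and the squared gradient.
    moment1 = beta1 * moment1 + (1 - beta1) * grad
    moment2 = beta2 * moment2 + (1 - beta2) * grad * grad
    # Bias correction uses beta1_pow = beta1**t and beta2_pow = beta2**t,
    # which start at beta1 / beta2 and are multiplied up by the "scale"
    # ops in _finish_update after every step.
    moment1_hat = moment1 / (1 - beta1_pow)
    moment2_hat = moment2 / (1 - beta2_pow)
    # ParamOut
    param = param - lr * moment1_hat / (np.sqrt(moment2_hat) + epsilon)
    return param, moment1, moment2, beta1_pow * beta1, beta2_pow * beta2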