@@ -24,7 +24,9 @@
from regularizer import append_regularization_ops
from clip import append_gradient_clip_ops, error_clip_callback

-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta'
+]


class Optimizer(object):
@@ -580,6 +582,88 @@ def _append_optimize_op(self, block, param_and_grad):
        return decayed_adagrad_op


+class AdadeltaOptimizer(Optimizer):
+    """
+    **Adadelta Optimizer**
+    Simple Adadelta optimizer with average squared grad state and
+    average squared update state.
+    For details of Adadelta, please refer to
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    Args:
+        learning_rate(float): global learning rate
+        rho(float): rho in the equation
+        epsilon(float): epsilon in the equation
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adadelta(
+                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(AdadeltaOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
+            attrs={"epsilon": self._epsilon,
+                   "rho": self._rho})
+
+        return adadelta_op
+
+
# We short the class name, since users will use the optimizer with the package
# name. The sample code:
#
@@ -594,3 +678,4 @@ def _append_optimize_op(self, block, param_and_grad):
Adam = AdamOptimizer
Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer
+Adadelta = AdadeltaOptimizer
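
For reference, here is a minimal NumPy sketch of the per-parameter update described by the docstring equations above. It is illustrative only: the function name adadelta_step and its argument layout are assumptions made for this note and are not part of the fluid API; in the PR itself the update is performed by the "adadelta" operator that _append_optimize_op emits.

import numpy as np


def adadelta_step(param, grad, avg_squared_grad, avg_squared_update,
                  rho=0.95, epsilon=1.0e-6):
    # E(g_t^2) = rho * E(g_{t-1}^2) + (1 - rho) * g^2
    avg_squared_grad = rho * avg_squared_grad + (1 - rho) * grad * grad
    # learning_rate = sqrt((E(dx_{t-1}^2) + epsilon) / (E(g_t^2) + epsilon))
    learning_rate = np.sqrt(
        (avg_squared_update + epsilon) / (avg_squared_grad + epsilon))
    # The parameter moves against the gradient, scaled element-wise.
    update = -learning_rate * grad
    # E(dx_t^2) = rho * E(dx_{t-1}^2) + (1 - rho) * (-g * learning_rate)^2
    avg_squared_update = rho * avg_squared_update + (1 - rho) * update * update
    return param + update, avg_squared_grad, avg_squared_update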