@@ -664,6 +664,123 @@ def _append_optimize_op(self, block, param_and_grad):
        return adadelta_op


+class RMSPropOptimizer(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive
+    learning rate method. RMSProp was originally proposed on Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates the moving average of the squared gradient
+    for each weight; the gradient is then divided by :math:`\\sqrt{r(w,t)}`.
+
+    In some cases, adding a momentum term :math:`\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where :math:`\\rho` is a hyperparameter with typical values such as 0.9
+    or 0.95, :math:`\\beta` is the momentum term, and :math:`\\epsilon` is a
+    smoothing term that avoids division by zero, usually set somewhere in the
+    range from 1e-4 to 1e-8.
+
+
+    Args:
+        learning_rate(float): global learning rate.
+        rho(float): rho is :math:`\\rho` in the equation above, set to 0.95
+            by default.
+        epsilon(float): :math:`\\epsilon` in the equation is a smoothing term
+            that avoids division by zero, set to 1e-6 by default.
+        momentum(float): :math:`\\beta` in the equation is the momentum term,
+            set to 0.0 by default.
+
+    Raises:
+        ValueError: If any of learning_rate, rho, epsilon, or momentum is
+            None.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.RMSProp(0.0001)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 **kwargs):
+        super(RMSPropOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum
+            })
+
+        return rmsprop_op
+
+
# We short the class name, since users will use the optimizer with the package
# name. The sample code:
#
@@ -679,3 +796,4 @@ def _append_optimize_op(self, block, param_and_grad):
Adamax = AdamaxOptimizer
DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer
+RMSProp = RMSPropOptimizer
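For reviewers who want to sanity-check the update rule the new `rmsprop` op is expected to compute, below is a minimal NumPy sketch of the equations from the docstring, with `rho` mapped to the op's `decay` attribute and `beta` to `momentum`. The helper name `rmsprop_update` and the toy tensors are illustrative assumptions, not part of this PR or of the underlying C++ op.

```python
import numpy as np


def rmsprop_update(param, grad, mean_square, moment,
                   learning_rate=1e-4, rho=0.95, epsilon=1e-6, momentum=0.0):
    # Hypothetical reference implementation of the docstring equations.
    # r(w, t) = rho * r(w, t-1) + (1 - rho) * grad^2   (MeanSquare accumulator)
    mean_square = rho * mean_square + (1.0 - rho) * grad * grad
    # v(w, t) = beta * v(w, t-1) + lr * grad / sqrt(r(w, t) + epsilon)   (Moment accumulator)
    moment = momentum * moment + \
        learning_rate * grad / np.sqrt(mean_square + epsilon)
    # w = w - v(w, t)
    param = param - moment
    return param, mean_square, moment


# Toy usage: accumulators start at zero, mirroring the freshly created
# "momentum" and "mean_square" accumulators added by _create_accumulators.
w = np.array([1.0, -2.0])
g = np.array([0.5, 0.1])
r = np.zeros_like(w)
v = np.zeros_like(w)
w, r, v = rmsprop_update(w, g, r, v, momentum=0.9)
print(w, r, v)
```

With `momentum=0.0` (the default in this PR) the sketch reduces to plain RMSProp; a nonzero `momentum` adds the velocity term described in the docstring.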