
Commit 30b7032

Expose RMSProp optimizer. (#9247)
* Add RMSProp optimizer wrapper. * Follow comments.
1 parent 5008020 commit 30b7032

File tree: 1 file changed, +118 -0 lines changed


python/paddle/fluid/optimizer.py

Lines changed: 118 additions & 0 deletions
@@ -664,6 +664,123 @@ def _append_optimize_op(self, block, param_and_grad):
         return adadelta_op
 
 
+class RMSPropOptimizer(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates a moving average of the squared gradient for
+    each weight. The gradient is then divided by :math:`\\sqrt{r(w,t)}`.
+
+    In some cases, adding a momentum term :math:`\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
+    and so on. :math:`\\beta` is the momentum term. :math:`\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in the
+    range from 1e-4 to 1e-8.
+
+
+    Args:
+        learning_rate(float): global learning rate.
+        rho(float): rho is :math:`\\rho` in the equation, set to 0.95 by default.
+        epsilon(float): :math:`\\epsilon` in the equation is a smoothing term to
+            avoid division by zero, set to 1e-6 by default.
+        momentum(float): :math:`\\beta` in the equation is the momentum term,
+            set to 0.0 by default.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon or momentum is None.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.RMSProp(0.0001)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 **kwargs):
+        super(RMSPropOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum
+            })
+
+        return rmsprop_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #
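
To make the docstring equations concrete, here is a minimal NumPy sketch of the momentum-variant update. The names param, grad, mean_square and moment are chosen to mirror the op's Param, Grad, MeanSquare and Moment tensors (mean_square is r(w, t), moment is v(w, t)); this sketch is illustrative only and is not part of the commit, which delegates the actual computation to the C++ "rmsprop" operator.

import numpy as np

def rmsprop_update(param, grad, mean_square, moment,
                   learning_rate=1e-4, rho=0.95, epsilon=1e-6, momentum=0.0):
    # r(w, t) = rho * r(w, t-1) + (1 - rho) * grad^2
    mean_square = rho * mean_square + (1.0 - rho) * grad * grad
    # v(w, t) = beta * v(w, t-1) + lr * grad / sqrt(r(w, t) + eps)
    moment = momentum * moment + learning_rate * grad / np.sqrt(mean_square + epsilon)
    # w = w - v(w, t)
    param = param - moment
    return param, mean_square, moment

# Toy usage: one update step on a random parameter/gradient pair.
w = np.random.randn(3).astype(np.float32)
g = np.random.randn(3).astype(np.float32)
r = np.zeros_like(w)  # MeanSquare accumulator
v = np.zeros_like(w)  # Moment accumulator
w, r, v = rmsprop_update(w, g, r, v)
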
@@ -679,3 +796,4 @@ def _append_optimize_op(self, block, param_and_grad):
 Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
+RMSProp = RMSPropOptimizer
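
With the alias in place, the optimizer is used like any other fluid optimizer. Below is a slightly expanded version of the docstring example, assuming `cost` is the scalar loss of an already-defined fluid program; the hyperparameter values shown are simply the defaults written out.

import paddle.fluid as fluid

# `cost` is assumed to be the mean loss tensor of an existing fluid network.
optimizer = fluid.optimizer.RMSProp(
    learning_rate=0.0001,  # required; a None value raises ValueError
    rho=0.95,              # decay of the squared-gradient moving average r(w, t)
    epsilon=1.0e-6,        # smoothing term in the denominator
    momentum=0.0)          # beta, the momentum term for v(w, t)
_, params_grads = optimizer.minimize(cost)
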
