Skip to content

Commit ca341db

Browse files
committed
add FtrlOptimizer and its doc
1 parent d734595 commit ca341db

File tree

1 file changed

+112
-4
lines changed

1 file changed

+112
-4
lines changed

python/paddle/fluid/optimizer.py

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@
2626
from contextlib import contextmanager
2727

2828
__all__ = [
29-
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
29+
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
3030
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
3131
'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
32-
'Adadelta', 'ModelAverage', 'Optimizer'
32+
'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer'
3333
]
3434

3535

@@ -628,7 +628,7 @@ class AdadeltaOptimizer(Optimizer):
628628
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
629629
630630
Args:
631-
learning_rate(float): global leraning rate
631+
learning_rate(float): global learning rate
632632
rho(float): rho in equation
633633
epsilon(float): epsilon in equation
634634
@@ -729,7 +729,7 @@ class RMSPropOptimizer(Optimizer):
729729
730730
731731
Args:
732-
learning_rate(float): global leraning rate.
732+
learning_rate(float): global learning rate.
733733
rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
734734
epsilon(float): :math: `\\epsilon` in equation is smoothing term to
735735
avoid division by zero, set 1e-6 by default.
@@ -810,6 +810,113 @@ def _append_optimize_op(self, block, param_and_grad):
810810
return rmsprop_op
811811

812812

813+
class FtrlOptimizer(Optimizer):
    """
    FTRL (Follow The Regularized Leader) Optimizer.

    The paper that proposed Follow The Regularized Leader (FTRL):
    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)

    .. math::

        &new\_accum = squared\_accum + grad^2

        &if (lr\_power == -0.5):

        &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}

        &else:

        &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}


        &x = l1 * sign(linear\_accum) - linear\_accum

        &if (lr\_power == -0.5):

        &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)

        &\quad pre\_shrink = \\frac{x}{y}

        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &else:

        &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)

        &\quad pre\_shrink = \\frac{x}{y}

        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)

        &squared\_accum += grad^2

    Args:
        learning_rate (float|Variable): global learning rate.
        l1 (float): L1 regularization strength, used as the shrinkage
            threshold on the linear accumulator (see the update rule above).
            Default 0.0 (no L1 regularization).
        l2 (float): L2 regularization strength, added to the denominator of
            the update (see the update rule above). Default 0.0 (no L2
            regularization).
        lr_power (float): learning-rate power; -0.5 selects the square-root
            schedule branch of the update rule. Default -0.5.

    Raises:
        ValueError: If learning_rate is None.

    Examples:
        .. code-block:: python

            optimizer = fluid.optimizer.Ftrl(0.0001)
            _, params_grads = optimizer.minimize(cost)
    """

    # Names under which the two per-parameter state tensors are registered
    # with the base Optimizer's accumulator machinery.
    _squared_acc_str = "squared"
    _linear_acc_str = "linear"

    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
        super(FtrlOptimizer, self).__init__(
            learning_rate=learning_rate, **kwargs)
        if learning_rate is None:
            raise ValueError("learning_rate is not set.")

        self.type = "ftrl"
        self._l1 = l1
        self._l2 = l2
        self._lr_power = lr_power

    def _create_accumulators(self, block, parameters):
        """Create the squared and linear accumulators for every parameter."""
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        for p in parameters:
            self._add_accumulator(self._squared_acc_str, p)
            self._add_accumulator(self._linear_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        """Append one ftrl op updating the given (param, grad) pair.

        Returns:
            Operator: the appended ftrl operator.
        """
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        squared_acc = self._get_accumulator(self._squared_acc_str,
                                            param_and_grad[0])
        linear_acc = self._get_accumulator(self._linear_acc_str,
                                           param_and_grad[0])
        ftrl_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "SquaredAccumulator": squared_acc,
                "LinearAccumulator": linear_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "SquaredAccumOut": squared_acc,
                "LinearAccumOut": linear_acc
            },
            attrs={
                "l1": self._l1,
                # Bug fix: was "l2": self._l1, which silently passed the L1
                # coefficient as the L2 coefficient whenever l1 != l2.
                "l2": self._l2,
                "lr_power": self._lr_power
            })

        return ftrl_op
918+
919+
813920
# We short the class name, since users will use the optimizer with the package
814921
# name. The sample code:
815922
#
@@ -826,6 +933,7 @@ def _append_optimize_op(self, block, param_and_grad):
826933
DecayedAdagrad = DecayedAdagradOptimizer
827934
Adadelta = AdadeltaOptimizer
828935
RMSProp = RMSPropOptimizer
936+
Ftrl = FtrlOptimizer
829937

830938

831939
class ModelAverage(Optimizer):

0 commit comments

Comments
 (0)