from contextlib import contextmanager

__all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'Adadelta', 'ModelAverage', 'Optimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer'
]

@@ -628,7 +628,7 @@ class AdadeltaOptimizer(Optimizer):

        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2

    Args:
-        learning_rate(float): global leraning rate
+        learning_rate(float): global learning rate
        rho(float): rho in equation
        epsilon(float): epsilon in equation

@@ -729,7 +729,7 @@ class RMSPropOptimizer(Optimizer):


    Args:
-        learning_rate(float): global leraning rate.
+        learning_rate(float): global learning rate.
        rho(float): rho is :math:`\\rho` in equation, set 0.95 by default.
        epsilon(float): :math:`\\epsilon` in equation is smoothing term to
            avoid division by zero, set 1e-6 by default.

@@ -810,6 +810,113 @@ def _append_optimize_op(self, block, param_and_grad):
        return rmsprop_op


+class FtrlOptimizer(Optimizer):
+    """
+    FTRL (Follow The Regularized Leader) Optimizer.
+
+    The paper that proposed Follow The Regularized Leader (FTRL):
+    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+    .. math::
+
+        &new\_accum = squared\_accum + grad^2
+
+        &if (lr\_power == -0.5):
+
+        &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
+
+        &else:
+
+        &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
+
+
+        &x = l1 * sign(linear\_accum) - linear\_accum
+
+        &if (lr\_power == -0.5):
+
+        &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &else:
+
+        &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
+
+        &\quad pre\_shrink = \\frac{x}{y}
+
+        &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &squared\_accum += grad^2
+
+    Args:
+        learning_rate (float|Variable): global learning rate.
+        l1 (float): L1 regularization strength, 0.0 by default.
+        l2 (float): L2 regularization strength, 0.0 by default.
+        lr_power (float): learning rate power, -0.5 by default.
+
+    Raises:
+        ValueError: If learning_rate is None.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Ftrl(0.0001)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _squared_acc_str = "squared"
+    _linear_acc_str = "linear"
+
+    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+        super(FtrlOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+
+        self.type = "ftrl"
+        self._l1 = l1
+        self._l2 = l2
+        self._lr_power = lr_power
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._squared_acc_str, p)
+            self._add_accumulator(self._linear_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        squared_acc = self._get_accumulator(self._squared_acc_str,
+                                            param_and_grad[0])
+        linear_acc = self._get_accumulator(self._linear_acc_str,
+                                           param_and_grad[0])
+        ftrl_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "SquaredAccumulator": squared_acc,
+                "LinearAccumulator": linear_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "SquaredAccumOut": squared_acc,
+                "LinearAccumOut": linear_acc
+            },
+            attrs={"l1": self._l1,
+                   "l2": self._l2,
+                   "lr_power": self._lr_power})
+
+        return ftrl_op
+
+
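The update documented in the FtrlOptimizer docstring is the FTRL-Proximal scheme from the cited paper. A minimal NumPy sketch of one per-coordinate step for the lr_power = -0.5 case (with the paper's beta term set to 0); the function `ftrl_step` and its arguments are illustrative only, not part of this patch or of the ftrl operator:

    import numpy as np

    def ftrl_step(param, grad, squared_acc, linear_acc, lr=0.01, l1=0.0, l2=0.0):
        # Accumulate the squared gradient and compute sigma = (sqrt(n_new) - sqrt(n_old)) / lr.
        new_acc = squared_acc + grad ** 2
        sigma = (np.sqrt(new_acc) - np.sqrt(squared_acc)) / lr
        # Update the linear ("z") accumulator before the parameter changes.
        linear_acc = linear_acc + grad - sigma * param
        # Closed-form proximal step: coordinates with |z| <= l1 are set to zero,
        # the rest are shrunk toward zero by the L1/L2-regularized denominator.
        x = l1 * np.sign(linear_acc) - linear_acc
        y = np.sqrt(new_acc) / lr + 2.0 * l2
        # np.maximum only guards the all-zero corner case against 0 / 0.
        param = np.where(np.abs(linear_acc) > l1, x / np.maximum(y, 1e-12), 0.0)
        return param, new_acc, linear_acc

    # Example: one step on a 3-element parameter vector.
    w = np.zeros(3)
    n = np.zeros(3)
    z = np.zeros(3)
    g = np.array([0.1, -0.2, 0.3])
    w, n, z = ftrl_step(w, g, n, z, lr=0.1, l1=0.01, l2=0.01)
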
# We short the class name, since users will use the optimizer with the package
# name. The sample code:
#
@@ -826,6 +933,7 @@ def _append_optimize_op(self, block, param_and_grad):
DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer
+Ftrl = FtrlOptimizer
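With the alias in place, the optimizer is constructed by its short name like the others; a hedged usage sketch in the spirit of the docstring example (the loss Variable `cost` and the rest of the program are assumed to exist):

    import paddle.fluid as fluid

    # `cost` is assumed to be a loss Variable built earlier in the program.
    optimizer = fluid.optimizer.Ftrl(
        learning_rate=0.01, l1=1e-4, l2=1e-4, lr_power=-0.5)
    _, params_grads = optimizer.minimize(cost)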


class ModelAverage(Optimizer):