|
23 | 23 | __all__ = [
|
24 | 24 | 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
|
25 | 25 | 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup',
|
26 | | - 'ReduceLROnPlateau' |
| 26 | + 'ReduceLROnPlateau', 'StepDecay', 'MultiStepDecay' |
27 | 27 | ]
|
28 | 28 |
|
29 | 29 |
|
@@ -72,6 +72,8 @@ def step(self):
|
72 | 72 |
|
73 | 73 | class PiecewiseDecay(LearningRateDecay):
|
74 | 74 | """
|
| 75 | + :api_attr: imperative |
| 76 | + |
75 | 77 | Piecewise decay scheduler.
|
76 | 78 |
|
77 | 79 | The algorithm can be described as the code below.
|
@@ -131,6 +133,8 @@ def step(self):
|
131 | 133 |
|
132 | 134 | class NaturalExpDecay(LearningRateDecay):
|
133 | 135 | """
|
| 136 | + :api_attr: imperative |
| 137 | +
|
134 | 138 | Applies natural exponential decay to the initial learning rate.
|
135 | 139 |
|
136 | 140 | The algorithm can be described as following.
|
@@ -183,7 +187,6 @@ class NaturalExpDecay(LearningRateDecay):
|
183 | 187 | staircase=True),
|
184 | 188 | parameter_list=emb.parameters())
|
185 | 189 |
|
186 | | -
|
187 | 190 | """
|
188 | 191 |
|
189 | 192 | def __init__(self,
|
@@ -213,6 +216,8 @@ def step(self):
|
213 | 216 |
|
214 | 217 | class ExponentialDecay(LearningRateDecay):
|
215 | 218 | """
|
| 219 | + :api_attr: imperative |
| 220 | +
|
216 | 221 | Applies exponential decay to the learning rate.
|
217 | 222 |
|
218 | 223 | The algorithm can be described as following.
|
@@ -293,6 +298,8 @@ def step(self):
|
293 | 298 |
|
294 | 299 | class InverseTimeDecay(LearningRateDecay):
|
295 | 300 | """
|
| 301 | + :api_attr: imperative |
| 302 | +
|
296 | 303 | Applies inverse time decay to the initial learning rate.
|
297 | 304 |
|
298 | 305 | The algorithm can be described as following.
|
@@ -369,6 +376,8 @@ def step(self):
|
369 | 376 |
|
370 | 377 | class PolynomialDecay(LearningRateDecay):
|
371 | 378 | """
|
| 379 | + :api_attr: imperative |
| 380 | +
|
372 | 381 | Applies polynomial decay to the initial learning rate.
|
373 | 382 |
|
374 | 383 | The algorithm can be described as following.
|
@@ -461,6 +470,8 @@ def step(self):
|
461 | 470 |
|
462 | 471 | class CosineDecay(LearningRateDecay):
|
463 | 472 | """
|
| 473 | + :api_attr: imperative |
| 474 | +
|
464 | 475 | Applies cosine decay to the learning rate.
|
465 | 476 |
|
466 | 477 | The algorithm can be described as following.
|
@@ -517,6 +528,8 @@ def step(self):
|
517 | 528 |
|
518 | 529 | class NoamDecay(LearningRateDecay):
|
519 | 530 | """
|
| 531 | + :api_attr: imperative |
| 532 | +
|
520 | 533 | Applies Noam decay to the initial learning rate.
|
521 | 534 |
|
522 | 535 | The algorithm can be described as following.
|
@@ -582,6 +595,8 @@ def step(self):
|
582 | 595 |
|
583 | 596 | class LinearLrWarmup(LearningRateDecay):
|
584 | 597 | """
|
| 598 | + :api_attr: imperative |
| 599 | +
|
585 | 600 | This operator uses the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
|
586 | 601 | For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
|
587 | 602 |
|
@@ -670,6 +685,8 @@ def step(self):
|
670 | 685 |
|
671 | 686 | class ReduceLROnPlateau(LearningRateDecay):
|
672 | 687 | """
|
| 688 | + :api_attr: imperative |
| 689 | +
|
673 | 690 | Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate
|
674 | 691 | by 2 to 10 times once model performance stops improving.
|
675 | 692 |
|
@@ -774,7 +791,6 @@ def __init__(self,
|
774 | 791 | raise ValueError('threshold mode ' + threshold_mode +
|
775 | 792 | ' is unknown!')
|
776 | 793 | self.threshold_mode = threshold_mode
|
777 | | -
778 | 794 | check_type(learning_rate, 'learning_rate', (float, int, Variable),
|
779 | 795 | 'ReduceLROnPlateau')
|
780 | 796 | if isinstance(learning_rate, (float, int)):
|
@@ -856,3 +872,217 @@ def _is_better(self, current, best):
|
856 | 872 |
|
857 | 873 | else:
|
858 | 874 | return current > best + self.threshold
|
| 875 | + |
| 876 | + |
| 877 | +class _LearningRateEpochDecay(LearningRateDecay): |
| 878 | + """ |
| 879 | + :api_attr: imperative |
| 880 | +
|
| 881 | + Base class of learning rate decay, which is updated each epoch. |
| 882 | + |
| 883 | + Defines the common interface of _LearningRateEpochDecay. |
| 884 | + Users should not use this class directly, but should use one of its |
| 885 | + implementations and invoke the method `epoch()` once per epoch. |
| 886 | + """ |
| 887 | + |
| 888 | + def __init__(self, learning_rate, dtype=None): |
| 889 | + if not isinstance(learning_rate, (float, int)): |
| 890 | + raise TypeError( |
| 891 | + "The type of 'learning_rate' must be 'float, int', but received %s." |
| 892 | + % type(learning_rate)) |
| 893 | + if learning_rate >= 1.0: |
| 894 | + raise ValueError("The initial learning rate") |
| 895 | + |
| 896 | + self.base_lr = float(learning_rate) |
| 897 | + |
| 898 | + self.epoch_num = -1 |
| 899 | + # default to float32 when no dtype is given |
| 900 | + self.dtype = "float32" if dtype is None else dtype |
| 901 | + self.learning_rate = self.create_lr_var(self.base_lr) |
| 902 | + |
| 903 | + self.epoch() |
| 904 | + |
| 905 | + def __call__(self): |
| 906 | + """ |
| 907 | + Return last computed learning rate on current epoch. |
| 908 | + """ |
| 909 | + return self.learning_rate |
| 910 | + |
| 911 | + def epoch(self, epoch=None): |
| 912 | + """ |
| 913 | + Compute the learning rate and update it when invoked. |
| 914 | + """ |
| 915 | + if epoch is None: |
| 916 | + self.epoch_num += 1 |
| 917 | + else: |
| 918 | + self.epoch_num = epoch |
| 919 | + |
| 920 | + self.learning_rate = self.get_lr() |
| 921 | + if isinstance(self.learning_rate, float): |
| 922 | + self.learning_rate = self.create_lr_var(self.learning_rate) |
| 923 | + |
| 924 | + def get_lr(self): |
| 925 | + raise NotImplementedError |
| 926 | + |
| 927 | + |
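For illustration only, not part of this patch: a minimal sketch of how a concrete scheduler plugs into this base class. The hypothetical _HalvingDecay below overrides only get_lr() and relies on the base_lr and epoch_num attributes maintained above; like the docstring examples further down, it is meant to run under fluid.dygraph.guard().

class _HalvingDecay(_LearningRateEpochDecay):
    # Hypothetical subclass for illustration: the base __init__ stores base_lr,
    # sets epoch_num to -1 and calls epoch(), which in turn calls get_lr().
    def get_lr(self):
        # Halve the initial rate once per completed epoch.
        return self.base_lr * (0.5 ** self.epoch_num)

# Usage sketch: epoch() with no argument advances epoch_num by one,
# while epoch(10) jumps straight to epoch 10 (e.g. when resuming training).
scheduler = _HalvingDecay(0.5)   # lr starts at 0.5 (epoch_num becomes 0)
scheduler.epoch()                # epoch_num 1, lr becomes 0.25
scheduler.epoch(10)              # epoch_num 10, lr becomes 0.5 * 0.5 ** 10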
| 928 | +class StepDecay(_LearningRateEpochDecay): |
| 929 | + """ |
| 930 | + :api_attr: imperative |
| 931 | +
|
| 932 | + Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` epochs. |
| 933 | +
|
| 934 | + The algorithm can be described as the code below. |
| 935 | +
|
| 936 | + .. code-block:: text |
| 937 | +
|
| 938 | + learning_rate = 0.5 |
| 939 | + step_size = 30 |
| 940 | + decay_rate = 0.1 |
| 941 | +
|
| 942 | + learning_rate = 0.5 if epoch < 30 |
| 943 | + learning_rate = 0.05 if 30 <= epoch < 60 |
| 944 | + learning_rate = 0.005 if 60 <= epoch < 90 |
| 945 | + ... |
| 946 | +
|
| 947 | + Parameters: |
| 948 | + learning_rate (float|int): The initial learning rate. It can be set to python float or int number. |
| 949 | + step_size (int): Period of learning rate decay. |
| 950 | + decay_rate (float, optional): The ratio by which the learning rate will be reduced: ``new_lr = origin_lr * decay_rate``. |
| 951 | + It should be less than 1.0. Default: 0.1. |
| 952 | +
|
| 953 | + Returns: |
| 954 | + None. |
| 955 | +
|
| 956 | + Examples: |
| 957 | + .. code-block:: python |
| 958 | + |
| 959 | + import paddle.fluid as fluid |
| 960 | + import numpy as np |
| 961 | + with fluid.dygraph.guard(): |
| 962 | + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") |
| 963 | + linear = fluid.dygraph.Linear(10, 10) |
| 964 | + input = fluid.dygraph.to_variable(x) |
| 965 | + scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) |
| 966 | + adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) |
| 967 | +
|
| 968 | + for epoch in range(9): |
| 969 | + for batch_id in range(5): |
| 970 | + out = linear(input) |
| 971 | + loss = fluid.layers.reduce_mean(out) |
| 972 | + adam.minimize(loss) |
| 973 | + scheduler.epoch() |
| 974 | +
|
| 975 | + print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr())) |
| 976 | + # epoch:0, current lr is 0.5 |
| 977 | + # epoch:1, current lr is 0.5 |
| 978 | + # epoch:2, current lr is 0.5 |
| 979 | + # epoch:3, current lr is 0.05 |
| 980 | + # epoch:4, current lr is 0.05 |
| 981 | + # epoch:5, current lr is 0.05 |
| 982 | + # epoch:6, current lr is 0.005 |
| 983 | + # epoch:7, current lr is 0.005 |
| 984 | + # epoch:8, current lr is 0.005 |
| 985 | +
|
| 986 | + """ |
| 987 | + |
| 988 | + def __init__(self, learning_rate, step_size, decay_rate=0.1): |
| 989 | + if not isinstance(step_size, int): |
| 990 | + raise TypeError( |
| 991 | + "The type of 'step_size' must be 'int', but received %s." % |
| 992 | + type(step_size)) |
| 993 | + if decay_rate >= 1.0: |
| 994 | + raise ValueError('decay_rate should be < 1.0.') |
| 995 | + |
| 996 | + self.step_size = step_size |
| 997 | + self.decay_rate = decay_rate |
| 998 | + super(StepDecay, self).__init__(learning_rate) |
| 999 | + |
| 1000 | + def get_lr(self): |
| 1001 | + decay_rate = self.create_lr_var(self.decay_rate) |
| 1002 | + i = self.epoch_num // self.step_size |
| 1003 | + return self.base_lr * (decay_rate**i) |
| 1004 | + |
| 1005 | + |
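As a quick arithmetic check of the expression in get_lr() (plain Python, no Paddle required), the schedule from the docstring example above can be reproduced directly; the constants mirror that example and are otherwise illustrative.

base_lr, step_size, decay_rate = 0.5, 3, 0.1
for epoch in range(9):
    i = epoch // step_size              # number of completed decay periods
    lr = base_lr * (decay_rate ** i)    # same expression as StepDecay.get_lr()
    print("epoch:{}, lr is {}".format(epoch, lr))
# epochs 0-2 -> 0.5, epochs 3-5 -> 0.05, epochs 6-8 -> 0.005 (up to float rounding)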
| 1006 | +class MultiStepDecay(_LearningRateEpochDecay): |
| 1007 | + """ |
| 1008 | + :api_attr: imperative |
| 1009 | +
|
| 1010 | + Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones. |
| 1011 | +
|
| 1012 | + The algorithm can be described as the code below. |
| 1013 | +
|
| 1014 | + .. code-block:: text |
| 1015 | +
|
| 1016 | + learning_rate = 0.5 |
| 1017 | + milestones = [30, 50] |
| 1018 | + decay_rate = 0.1 |
| 1019 | + if epoch < 30: |
| 1020 | + learning_rate = 0.5 |
| 1021 | + elif epoch < 50: |
| 1022 | + learning_rate = 0.05 |
| 1023 | + else: |
| 1024 | + learning_rate = 0.005 |
| 1025 | +
|
| 1026 | + Parameters: |
| 1027 | + learning_rate (float|int): The initial learning rate. It can be set to python float or int number. |
| 1028 | + milestones (tuple|list): List or tuple of milestone epoch boundaries. Must be increasing. |
| 1029 | + decay_rate (float, optional): The ratio by which the learning rate will be reduced: ``new_lr = origin_lr * decay_rate``. |
| 1030 | + It should be less than 1.0. Default: 0.1. |
| 1031 | +
|
| 1032 | + Returns: |
| 1033 | + None. |
| 1034 | +
|
| 1035 | + Examples: |
| 1036 | + .. code-block:: python |
| 1037 | + |
| 1038 | + import paddle.fluid as fluid |
| 1039 | + import numpy as np |
| 1040 | + with fluid.dygraph.guard(): |
| 1041 | + x = np.random.uniform(-1, 1, [10, 10]).astype("float32") |
| 1042 | + linear = fluid.dygraph.Linear(10, 10) |
| 1043 | + input = fluid.dygraph.to_variable(x) |
| 1044 | + scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5]) |
| 1045 | + adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters()) |
| 1046 | +
|
| 1047 | + for epoch in range(6): |
| 1048 | + for batch_id in range(5): |
| 1049 | + out = linear(input) |
| 1050 | + loss = fluid.layers.reduce_mean(out) |
| 1051 | + adam.minimize(loss) |
| 1052 | + scheduler.epoch() |
| 1053 | +
|
| 1054 | + print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr())) |
| 1055 | + # epoch:0, current lr is 0.5 |
| 1056 | + # epoch:1, current lr is 0.5 |
| 1057 | + # epoch:2, current lr is 0.5 |
| 1058 | + # epoch:3, current lr is 0.05 |
| 1059 | + # epoch:4, current lr is 0.05 |
| 1060 | + # epoch:5, current lr is 0.005 |
| 1061 | +
|
| 1062 | + """ |
| 1063 | + |
| 1064 | + def __init__(self, learning_rate, milestones, decay_rate=0.1): |
| 1065 | + if not isinstance(milestones, (tuple, list)): |
| 1066 | + raise TypeError( |
| 1067 | + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." |
| 1068 | + % type(milestones)) |
| 1069 | + |
| 1070 | + if not all([ |
| 1071 | + milestones[i] < milestones[i + 1] |
| 1072 | + for i in range(len(milestones) - 1) |
| 1073 | + ]): |
| 1074 | + raise ValueError('The elements of milestones must be increasing.') |
| 1075 | + if decay_rate >= 1.0: |
| 1076 | + raise ValueError('decay_rate should be < 1.0.') |
| 1077 | + |
| 1078 | + self.milestones = milestones |
| 1079 | + self.decay_rate = decay_rate |
| 1080 | + super(MultiStepDecay, self).__init__(learning_rate) |
| 1081 | + |
| 1082 | + def get_lr(self): |
| 1083 | + decay_rate = self.create_lr_var(self.decay_rate) |
| 1084 | + for i in range(len(self.milestones)): |
| 1085 | + if self.epoch_num < self.milestones[i]: |
| 1086 | + return self.base_lr * (decay_rate**i) |
| 1087 | + |
| 1088 | + return self.base_lr * (decay_rate**len(self.milestones)) |
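The milestone loop in get_lr() returns base_lr scaled by decay_rate raised to the number of milestones the current epoch has already reached. A small standalone check (plain Python, not part of this patch) shows that correspondence for the docstring example above:

import bisect

base_lr, milestones, decay_rate = 0.5, [3, 5], 0.1
for epoch in range(6):
    i = bisect.bisect_right(milestones, epoch)  # milestones already reached
    lr = base_lr * (decay_rate ** i)            # matches MultiStepDecay.get_lr()
    print("epoch:{}, lr is {}".format(epoch, lr))
# epochs 0-2 -> 0.5, epochs 3-4 -> 0.05, epoch 5 -> 0.005 (up to float rounding)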