# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework

__all__ = ["extend_with_decoupled_weight_decay"]


class DecoupledWeightDecay(object):
    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
        if not isinstance(coeff, float) and \
                not isinstance(coeff, framework.Variable):
            raise TypeError("coeff should be float or Variable.")
        self._params_name = set()
        self._apply_decay_param_fun = apply_decay_param_fun
        self._coeff = coeff
        super(DecoupledWeightDecay, self).__init__(**kwargs)

    def _scale_parameters(self, params_and_grads):
        """
        Adds the weight decay ops:
        scaled_parameter = parameter * coeff

        Args:
            params_and_grads: A list of (parameter, gradient) pairs of the
                parameters to be decayed.
        Raises:
            Exception: The dtypes of coeff and parameter are not consistent.
        """
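        # A plain float coefficient of 0.0 disables decay, so no ops are added.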
        if isinstance(self._coeff, float) and self._coeff == 0.0:
            return

        scaled_params = []
        for param, grad in params_and_grads:
            # If no gradient then we don't need to do anything
            if grad is None:
                continue
            if self._apply_decay_param_fun is not None \
                    and not self._apply_decay_param_fun(param.name):
                continue

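            # A float coeff is only consistent with FP32 parameters; a Variable
            # coeff must match the parameter's dtype exactly.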
            if isinstance(self._coeff, float):
                assert param.dtype == paddle.fluid.core.VarDesc.VarType.FP32, \
                    "the type of coeff(float) and parameter(%s) is not consistent." % (param.dtype)
            else:
                assert self._coeff.dtype == param.dtype, \
                    "the type of coeff(%s) and parameter(%s) is not consistent." % (self._coeff.dtype, param.dtype)
            with param.block.program._optimized_guard(
                    [param, grad]), framework.name_scope('weight decay'):
                assert param.name not in self._params_name
                scaled_params.append((param, grad, param * self._coeff))
                self._params_name.add(param.name)
        return scaled_params

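    # backward() and apply_optimize() simply delegate to the wrapped optimizer:
    # with the multiple inheritance set up in extend_with_decoupled_weight_decay
    # below, super() resolves to the base optimizer class.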
    def backward(self, **kwargs):
        return super(DecoupledWeightDecay, self).backward(**kwargs)

    def apply_optimize(self, **kwargs):
        return super(DecoupledWeightDecay, self).apply_optimize(**kwargs)

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        params_grads = self.backward(
            loss=loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)
        scaled_params = self._scale_parameters(params_grads)
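        # Decoupled update: param <- param - param * coeff, applied directly to
        # the parameter and independently of the gradient-based update that
        # apply_optimize() builds below.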
        for p_grad_sgrad in scaled_params:
            param, grad, scaled_param = p_grad_sgrad
            with param.block.program._optimized_guard(
                    [param, grad]), framework.name_scope('weight decay'):
                updated_param = paddle.fluid.layers.elementwise_sub(
                    x=param, y=scaled_param)
                paddle.fluid.layers.assign(input=updated_param, output=param)

        optimize_ops = self.apply_optimize(
            loss=loss,
            params_grads=params_grads,
            startup_program=startup_program)
        return optimize_ops, params_grads

    def __str__(self):
        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])


def extend_with_decoupled_weight_decay(base_optimizer):
    """
    extend_with_decoupled_weight_decay is a decorator function that returns an
    optimizer class with decoupled weight decay. The returned optimizer will
    apply weight decay on the optimized parameters with the parameters before
    optimization, i.e.: new_parameter = optimized_parameter - parameter * coeff.
    For the details of decoupled weight decay, please refer to
    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.

    Args:
        base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.

    Returns:
        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.

    Examples:

        .. code-block:: python

            AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
                fluid.optimizer.Adam)
            optimizer = AdamW(learning_rate=0.1,
                              weight_decay=0.01)

            optimizer.minimize(cost)
    """
    if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
        raise TypeError(
            "The input(base_optimizer) should be a derived class of Optimizer.")

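    # DecoupledWeightDecay is placed first in the MRO so that its minimize()
    # wraps the base optimizer's backward() and apply_optimize().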
    class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
                                            base_optimizer):
        """
        OptimizerWithDecoupledWeightDecay is used to update the optimized parameters
        with the parameters before optimization. For more information, please refer
        to https://arxiv.org/pdf/1711.05101.pdf.

        Args:
            weight_decay (float|Variable): The weight decay coefficient; it can be
                a float or a Variable.
            apply_decay_param_fun (function|None): If it is not None, only the
                parameters for which apply_decay_param_fun(param.name) returns True
                will be decayed. It is only needed when the decay should be applied
                to a subset of the parameters. Default: None.
        """

        def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
            super(OptimizerWithDecoupledWeightDecay, self).__init__(
                weight_decay, apply_decay_param_fun, **kwargs)

    return OptimizerWithDecoupledWeightDecay
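
# Usage sketch (kept as a comment so importing this module has no side effects).
# The tiny regression network, the variable names x / y / cost and the
# hyper-parameters below are illustrative assumptions, not part of this module:
#
#     import paddle.fluid as fluid
#
#     x = fluid.layers.data(name='x', shape=[13], dtype='float32')
#     y = fluid.layers.data(name='y', shape=[1], dtype='float32')
#     prediction = fluid.layers.fc(input=x, size=1)
#     cost = fluid.layers.mean(fluid.layers.square_error_cost(prediction, y))
#
#     AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
#         fluid.optimizer.Adam)
#     optimizer = AdamW(learning_rate=0.01, weight_decay=0.01)
#     optimizer.minimize(cost)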