
Commit 53d1d0f

add LARS support (#10374)
1 parent dd55cc1 commit 53d1d0f

File tree (3 files changed, +59 -7 lines):

- python/paddle/fluid/layers/learning_rate_scheduler.py
- python/paddle/fluid/optimizer.py
- python/paddle/fluid/tests/book/test_recognize_digits.py


python/paddle/fluid/layers/learning_rate_scheduler.py

Lines changed: 40 additions & 1 deletion
@@ -25,10 +25,11 @@
 import ops
 import tensor
 from ..initializer import init_on_cpu
+from ..framework import default_main_program, Parameter
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
 ]
 
 
@@ -261,3 +262,41 @@ def piecewise_decay(boundaries, values):
             tensor.assign(last_value_var, lr)
 
     return lr
+
+
+def append_LARS(params_grads, learning_rate, weight_decay):
+    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+    each layer.
+
+    ```python
+    learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                     / (sqrt(sumsq(gradient)) + weight_decay * sqrt(sumsq(param)))
+    ```
+
+    Args:
+        learning_rate: A learning rate Variable. This
+            is the global learning rate for LARS.
+        weight_decay: A Python `float` number.
+
+    Returns:
+        The decayed learning rate
+    """
+
+    def _balanced_weight(param_norm, grad_norm):
+        if weight_decay == 1.0:
+            return grad_norm + param_norm
+        else:
+            return grad_norm + weight_decay * param_norm
+
+    for param, grad in params_grads:
+        param_lr = param.optimize_attr['learning_rate']
+        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+        if type(param_lr) == float and param_lr == 1.0:
+            decayed_lr = learning_rate * param_norm \
+                / _balanced_weight(param_norm, grad_norm)
+        else:
+            decayed_lr = learning_rate * param_lr * param_norm \
+                / _balanced_weight(param_norm, grad_norm)
+        # set back param local learning rate
+        param.optimize_attr['learning_rate'] = decayed_lr
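For intuition about the formula in the docstring, here is a standalone NumPy sketch (illustrative only, not part of this commit; the function name is made up) of the per-layer scaling that `append_LARS` computes: the global learning rate is scaled by ||param|| / (||grad|| + weight_decay * ||param||), where `sqrt(sumsq(.))` is the L2 norm.

```python
# Standalone NumPy sketch (not part of this commit) of the per-layer
# learning-rate scaling that append_LARS applies.
import numpy as np


def lars_scaled_lr(global_lr, param, grad, weight_decay):
    # sqrt(sumsq(.)) in the docstring is the L2 norm.
    param_norm = np.sqrt(np.sum(np.square(param)))
    grad_norm = np.sqrt(np.sum(np.square(grad)))
    # decayed_lr = global_lr * ||param|| / (||grad|| + weight_decay * ||param||)
    return global_lr * param_norm / (grad_norm + weight_decay * param_norm)


# A layer with large weights and small gradients gets a proportionally
# larger step, which is the point of layer-wise adaptive rate scaling.
param = np.random.randn(256, 128).astype('float32')
grad = 0.01 * np.random.randn(256, 128).astype('float32')
print(lars_scaled_lr(0.1, param, grad, weight_decay=0.0005))
```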

python/paddle/fluid/optimizer.py

Lines changed: 18 additions & 5 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 from collections import defaultdict
-from paddle.fluid.framework import Program
+from paddle.fluid.framework import Program, Variable
 import framework
 import layers
 from backward import append_backward
@@ -41,7 +41,10 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, learning_rate, regularization=None):
+    def __init__(self,
+                 learning_rate,
+                 regularization=None,
+                 LARS_weight_decay=0.0):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
@@ -61,6 +64,7 @@ def __init__(self, learning_rate, regularization=None):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
         lr = self.global_learning_rate()
@@ -100,10 +104,15 @@ def _create_param_lr(self, param_and_grad):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        if param_lr == 1.0:
-            return self.global_learning_rate()
+        if type(param_lr) == Variable:
+            # param learning rate has been updated (LARS)
+            print("returns updated param lr ", param_lr)
+            return param_lr
         else:
-            return self.global_learning_rate() * param_lr
+            if param_lr == 1.0:
+                return self.global_learning_rate()
+            else:
+                return self.global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -210,6 +219,10 @@ def create_optimization_pass(self,
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
         self._create_global_learning_rate()
+        if self._LARS_weight_decay > 0.0:
+            layers.append_LARS(parameters_and_grads,
+                               self.global_learning_rate(),
+                               self._LARS_weight_decay)
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
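For context, a minimal usage sketch (assuming the 2018-era `fluid` layers API; the tiny network is illustrative, only the optimizer line mirrors this commit): passing `LARS_weight_decay` to an optimizer that forwards keyword arguments to the base `Optimizer`, as `Adam` does in the test change below, is all that is needed to enable LARS.

```python
# Illustrative sketch: enabling LARS from user code. Only the optimizer call
# mirrors this commit; the tiny network is assumed for self-containment.
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=image, size=10, act='softmax')
avg_loss = fluid.layers.mean(
    x=fluid.layers.cross_entropy(input=predict, label=label))

# LARS_weight_decay reaches the base Optimizer; since it is > 0.0,
# create_optimization_pass() calls layers.append_LARS(...), which rewrites
# each parameter's optimize_attr['learning_rate'] into a per-layer Variable
# that _create_param_lr() then returns directly.
optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
optimizer.minimize(avg_loss)
```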

python/paddle/fluid/tests/book/test_recognize_digits.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ def train(nn_type,
 
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
     optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
