
Commit bb80dae

Author: chengduo
Add DecoupledWeightDecay (#16427)
* Add DecoupledWeightDecay
Parent: ea6e565

File tree

7 files changed, +402 -37 lines changed


paddle/fluid/API.spec

Lines changed: 13 additions & 0 deletions
Large diffs are not rendered by default.

python/paddle/fluid/contrib/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,8 @@
 from .slim import *
 from . import utils
 from .utils import *
+from . import extend_optimizer
+from .extend_optimizer import *

 __all__ = []
 __all__ += decoder.__all__
@@ -40,3 +42,4 @@
 __all__ += reader.__all__
 __all__ += slim.__all__
 __all__ += utils.__all__
+__all__ += extend_optimizer.__all__
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from . import extend_optimizer_with_weight_decay
from .extend_optimizer_with_weight_decay import *

__all__ = []
__all__ += extend_optimizer_with_weight_decay.__all__
Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework

__all__ = ["extend_with_decoupled_weight_decay"]


class DecoupledWeightDecay(object):
    def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
        if not isinstance(coeff, float) and \
                not isinstance(coeff, framework.Variable):
            raise TypeError("coeff should be float or Variable.")
        self._params_name = set()
        self._apply_decay_param_fun = apply_decay_param_fun
        self._coeff = coeff
        super(DecoupledWeightDecay, self).__init__(**kwargs)

    def _scale_parameters(self, params_and_grads):
        """
        Adds weight decay ops.
        scaled_parameter = parameter * coeff

        Args:
            params_and_grads: A list of (parameter, gradient) pairs of the
                parameters to be decayed.
        Raises:
            Exception: The type of coeff and parameter is not consistent.
        """
        # Return an empty list (rather than None) so callers can always
        # iterate over the result, even when the decay coefficient is zero.
        if isinstance(self._coeff, float) and self._coeff == 0.0:
            return []

        scaled_params = []
        for param, grad in params_and_grads:
            # If no gradient then we don't need to do anything
            if grad is None:
                continue
            if self._apply_decay_param_fun is not None \
                    and not self._apply_decay_param_fun(param.name):
                continue

            if isinstance(self._coeff, float):
                # A plain float coeff is only consistent with FP32 parameters.
                assert param.dtype == paddle.fluid.core.VarDesc.VarType.FP32, \
                    "the type of coeff(float) and parameter(%s) is not consistent." % param.dtype
            else:
                assert self._coeff.dtype == param.dtype, \
                    "the type of coeff(%s) and parameter(%s) is not consistent." % (
                        self._coeff.dtype, param.dtype)

            with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
                assert param.name not in self._params_name
                scaled_params.append((param, grad, param * self._coeff))
                self._params_name.add(param.name)
        return scaled_params

    def backward(self, **kargs):
        return super(DecoupledWeightDecay, self).backward(**kargs)

    def apply_optimize(self, **kargs):
        return super(DecoupledWeightDecay, self).apply_optimize(**kargs)

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        params_grads = self.backward(
            loss=loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)
        scaled_params = self._scale_parameters(params_grads)
        for p_grad_sgrad in scaled_params:
            param, grad, scaled_param = p_grad_sgrad
            with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
                updated_param = paddle.fluid.layers.elementwise_sub(
                    x=param, y=scaled_param)
                paddle.fluid.layers.assign(input=updated_param, output=param)

        optimize_ops = self.apply_optimize(
            loss=loss,
            params_grads=params_grads,
            startup_program=startup_program)
        return optimize_ops, params_grads

    def __str__(self):
        return " ".join(["Weight Decay, params:", ",".join(self._params_name)])


def extend_with_decoupled_weight_decay(base_optimizer):
    """
    extend_with_decoupled_weight_decay is a decorator function. It returns an
    optimizer class with decoupled weight decay. The returned optimizer will
    apply weight decay on the optimized parameters using the parameter values
    from before optimization, i.e., new_parameter = optimized_parameter - parameter * coeff.
    For the details of decoupled weight decay, please refer to
    `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.

    Args:
        base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.

    Returns:
        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.

    Examples:

        .. code-block:: python

            AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
                fluid.optimizer.Adam)
            optimizer = AdamW(learning_rate=0.1,
                              weight_decay=0.01)

            optimizer.minimize(cost)
    """
    if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
        raise TypeError(
            "The input(base_optimizer) should be a derived class of Optimizer.")

    class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
                                            base_optimizer):
        """
        OptimizerWithDecoupledWeightDecay applies decoupled weight decay: the
        optimized parameters are further updated using the parameter values
        from before the optimization step. For more information, please refer
        to https://arxiv.org/pdf/1711.05101.pdf.

        Args:
            weight_decay (float|Variable): The weight decay coefficient, it can
                be a float or a Variable.
            apply_decay_param_fun (function|None): If it is not None, only
                parameters whose names make apply_decay_param_fun(name)==True
                will be decayed. It only works when you want to decay a subset
                of the parameters. Default: None.
        """

        def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
            super(OptimizerWithDecoupledWeightDecay, self).__init__(
                weight_decay, apply_decay_param_fun, **kwargs)

    return OptimizerWithDecoupledWeightDecay
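The docstring above only sketches the intended usage. A slightly fuller, standalone sketch of how the new fluid.contrib.extend_with_decoupled_weight_decay API can be driven end to end is shown below; it is not part of this commit, and the toy regression network, feed data, and variable names are illustrative assumptions.

import numpy as np
import paddle.fluid as fluid

# Build an AdamW-style optimizer class from the stock Adam optimizer.
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(fluid.optimizer.Adam)

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=pred, label=y)
avg_cost = fluid.layers.mean(x=cost)

# weight_decay is the decoupled coefficient: each step subtracts
# weight_decay * parameter (taken before the Adam update) from the parameter,
# instead of folding the decay into the gradient.
optimizer = AdamW(learning_rate=0.01, weight_decay=0.01)
optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
feed = {'x': np.random.rand(8, 13).astype('float32'),
        'y': np.random.rand(8, 1).astype('float32')}
loss, = exe.run(fluid.default_main_program(), feed=feed, fetch_list=[avg_cost])
print(loss)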
Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
import contextlib


def get_places():
    places = [fluid.CPUPlace()]
    if fluid.core.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))
    return places


@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
    scope = fluid.core.Scope()
    with fluid.unique_name.guard():
        with fluid.scope_guard(scope):
            with fluid.program_guard(main_prog, startup_prog):
                yield


def bow_net(data,
            label,
            dict_dim,
            is_sparse=False,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """
    BOW net
    This model is from https://github.com/PaddlePaddle/models:
    fluid/PaddleNLP/text_classification/nets.py
    """
    emb = fluid.layers.embedding(
        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    return avg_cost


class TestWeightDecay(unittest.TestCase):
    def setUp(self):
        self.word_dict = paddle.dataset.imdb.word_dict()
        reader = paddle.batch(
            paddle.dataset.imdb.train(self.word_dict), batch_size=2)()
        self.train_data = [next(reader) for _ in range(5)]
        self.learning_rate = .5

    def run_program(self, place, feed_list):
        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
        exe.run(fluid.default_startup_program())

        main_prog = fluid.default_main_program()
        param_list = [var.name for var in main_prog.block(0).all_parameters()]

        param_sum = []
        for data in self.train_data:
            out = exe.run(main_prog,
                          feed=feeder.feed(data),
                          fetch_list=param_list)
            p_sum = 0
            for v in out:
                p_sum += np.sum(np.abs(v))
            param_sum.append(p_sum)
        return param_sum

    def check_weight_decay(self, place, model):
        main_prog = fluid.framework.Program()
        startup_prog = fluid.framework.Program()
        startup_prog.random_seed = 1
        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
            data = fluid.layers.data(
                name="words", shape=[1], dtype="int64", lod_level=1)
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            avg_cost = model(data, label, len(self.word_dict))
            AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
                fluid.optimizer.Adam)

            optimizer = AdamW(
                learning_rate=self.learning_rate,
                weight_decay=self.learning_rate)

            optimizer.minimize(avg_cost)
            param_sum = self.run_program(place, [data, label])

        return param_sum

    def check_weight_decay2(self, place, model):
        main_prog = fluid.framework.Program()
        startup_prog = fluid.framework.Program()
        startup_prog.random_seed = 1
        with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
            data = fluid.layers.data(
                name="words", shape=[1], dtype="int64", lod_level=1)
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")

            avg_cost = model(data, label, len(self.word_dict))

            param_list = [(var, var * self.learning_rate)
                          for var in main_prog.block(0).all_parameters()]

            optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)

            optimizer.minimize(avg_cost)
            for params in param_list:
                updated_p = fluid.layers.elementwise_sub(
                    x=params[0], y=params[1])
                fluid.layers.assign(input=updated_p, output=params[0])

            param_sum = self.run_program(place, [data, label])
        return param_sum

    def test_weight_decay(self):
        for place in get_places():
            model = partial(bow_net, is_sparse=False)
            param_sum1 = self.check_weight_decay(place, model)
            param_sum2 = self.check_weight_decay2(place, model)

            for i in range(len(param_sum1)):
                assert np.isclose(a=param_sum1[i], b=param_sum2[i], rtol=5e-5)


if __name__ == '__main__':
    unittest.main()
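The test above validates the decoupled form by comparing AdamW-style training against plain Adam followed by an explicit param - coeff * param subtraction computed from the pre-update parameter values. For intuition, a tiny NumPy sketch of how decoupled weight decay differs from folding an L2 term into the gradient (the distinction drawn in the linked paper) follows; it is illustrative only and not part of this commit, uses plain SGD instead of Adam, and the numbers are made up.

import numpy as np


def decoupled_step(param, grad, lr=0.1, coeff=0.01):
    # Decoupled weight decay: the decay term is computed from the parameter
    # itself and subtracted outside the gradient-based update.
    return param - lr * grad - coeff * param


def l2_step(param, grad, lr=0.1, coeff=0.01):
    # Classic L2 regularization: the decay is folded into the gradient first.
    return param - lr * (grad + coeff * param)


p = np.array([1.0, -2.0])
g = np.array([0.5, 0.5])
print(decoupled_step(p, g))  # [ 0.94  -2.03 ]
print(l2_step(p, g))         # [ 0.949 -2.048]

With plain SGD the two rules differ only by a rescaling of the coefficient, but with adaptive optimizers such as Adam the L2 term gets scaled by the adaptive denominator while the decoupled term does not, which is the behaviour this commit adds.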
