Skip to content

Commit ef169eb

Browse files
Merge pull request #9459 from wanghaoshuang/fix_avg
Make ModelAverage support the 'moving mean' and 'moving variance' of the batch_norm op
2 parents 80d7560 + a7c6bf7 commit ef169eb

File tree

4 files changed

+45
-22
lines changed

4 files changed

+45
-22
lines changed

python/paddle/fluid/framework.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,8 @@ def __init__(self, block, shape, dtype, **kwargs):
11831183

11841184
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
11851185

1186+
self.do_model_average = kwargs.get('do_model_average', None)
1187+
11861188
def __str__(self):
11871189
return self.to_string(True)
11881190

@@ -1203,7 +1205,7 @@ def to_string(self, throw_on_error, with_details=False):
12031205
if with_details:
12041206
res_str = Variable.to_string(self, throw_on_error, True)
12051207
additional_attr = ("trainable", "optimize_attr", "regularizer",
1206-
"gradient_clip_attr")
1208+
"gradient_clip_attr", "do_model_average")
12071209
for attr_name in additional_attr:
12081210
res_str += "%s: %s\n" % (attr_name,
12091211
str(getattr(self, attr_name)))

python/paddle/fluid/layers/nn.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,7 +1516,8 @@ def batch_norm(input,
15161516
in_place=False,
15171517
name=None,
15181518
moving_mean_name=None,
1519-
moving_variance_name=None):
1519+
moving_variance_name=None,
1520+
do_model_average_for_mean_and_var=False):
15201521
"""
15211522
This function helps create an operator to implement
15221523
the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1548,10 @@ def batch_norm(input,
15471548

15481549
mean = helper.create_parameter(
15491550
attr=ParamAttr(
1550-
name=moving_mean_name, initializer=Constant(0.0), trainable=False),
1551+
name=moving_mean_name,
1552+
initializer=Constant(0.0),
1553+
trainable=False,
1554+
do_model_average=do_model_average_for_mean_and_var),
15511555
shape=param_shape,
15521556
dtype=input.dtype)
15531557
mean.stop_gradient = True
@@ -1556,7 +1560,8 @@ def batch_norm(input,
15561560
attr=ParamAttr(
15571561
name=moving_variance_name,
15581562
initializer=Constant(1.0),
1559-
trainable=False),
1563+
trainable=False,
1564+
do_model_average=do_model_average_for_mean_and_var),
15601565
shape=param_shape,
15611566
dtype=input.dtype)
15621567
variance.stop_gradient = True
@@ -3374,14 +3379,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
33743379
Here are some examples to explain it.
33753380
33763381
1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
3377-
is [6, 8], the reshape operator will transform x into a 2-D tensor with
3382+
is [6, 8], the reshape operator will transform x into a 2-D tensor with
33783383
shape [6, 8] and leaving x's data unchanged.
33793384
33803385
2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
33813386
specified is [2, 3, -1, 2], the reshape operator will transform x into a
33823387
4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
3383-
case, one dimension of the target shape is set to -1, the value of this
3384-
dimension is inferred from the total element number of x and remaining
3388+
case, one dimension of the target shape is set to -1, the value of this
3389+
dimension is inferred from the total element number of x and remaining
33853390
dimensions.
33863391
33873392
3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3620,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
36153620
def pad(x, paddings, pad_value=0., name=None):
36163621
"""
36173622
Pads a tensor with a constant value given by :attr:`pad_value`, and the
3618-
padded width is specified by :attr:`paddings`.
3623+
padded width is specified by :attr:`paddings`.
36193624
36203625
Specifically, the number of values padded before the contents of :attr:`x`
36213626
in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3648,7 @@ def pad(x, paddings, pad_value=0., name=None):
36433648
x (Variable): The input tensor variable.
36443649
paddings (list): A list of integers. Its elements specify the padded
36453650
width before and after for each dimension in turn.
3646-
The length of :attr:paddings must be
3651+
The length of :attr:paddings must be
36473652
:math:`rank(x) \\times 2`.
36483653
pad_value (float): The constant value used to pad.
36493654
name(str|None): A name for this layer(optional). If set None, the layer

python/paddle/fluid/optimizer.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
14+
import re
1515
from collections import defaultdict
1616
from paddle.fluid.framework import Program
1717
import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
818818
min_average_window, max_average_window and current update times.
819819
820820
Args:
821-
params_grads: A list of parameter-grad variable pairs.
822821
average_window_rate: The rate of average window.
822+
params_grads: A list of parameter-grad variable pairs.
823823
min_average_window: The minimum size of average window.
824824
max_average_window: The maximum size of average window.
825825
@@ -840,33 +840,46 @@ class ModelAverage(Optimizer):
840840
"""
841841

842842
def __init__(self,
843-
params_grads,
844843
average_window_rate,
844+
params_grads=None,
845845
min_average_window=10000,
846846
max_average_window=10000,
847847
**kwargs):
848848
super(ModelAverage, self).__init__(0.0, **kwargs)
849849
self.average_window = average_window_rate
850850
self.min_average_window = min_average_window
851851
self.max_average_window = max_average_window
852-
self.params_grads = params_grads
852+
853+
self.params_grads = [] if params_grads is None else params_grads
854+
params = {}
855+
for param, grad in self.params_grads:
856+
if param.do_model_average != False:
857+
params[param.name] = (param, grad)
858+
for param in framework.default_main_program().global_block(
859+
).all_parameters():
860+
if param.name not in params and param.do_model_average != False:
861+
grad = param.block.create_var(
862+
name=unique_name.generate(".".join([param.name, 'tmp'])),
863+
dtype=param.dtype,
864+
persistable=False,
865+
stop_gradient=True)
866+
params[param.name] = (param, grad)
867+
self.params_grads = params.values()
868+
853869
for param, grad in self.params_grads:
854-
if grad is not None:
855-
self._append_average_accumulate_op(param)
870+
self._append_average_accumulate_op(param)
856871

857872
self.apply_program = Program()
858873
block = self.apply_program.global_block()
859874
with program_guard(main_program=self.apply_program):
860875
for param_grad in self.params_grads:
861-
if param_grad[1] is not None:
862-
self._add_average_apply_op(block, param_grad)
876+
self._add_average_apply_op(block, param_grad)
863877

864878
self.restore_program = Program()
865879
block = self.restore_program.global_block()
866880
with program_guard(main_program=self.restore_program):
867881
for param_grad in self.params_grads:
868-
if param_grad[1] is not None:
869-
self._add_average_restore_op(block, param_grad)
882+
self._add_average_restore_op(block, param_grad)
870883

871884
def _add_average_apply_op(self, block, param_grad):
872885
param = block.clone_variable(param_grad[0])

python/paddle/fluid/param_attr.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@ def __init__(self,
2828
learning_rate=1.0,
2929
regularizer=None,
3030
trainable=True,
31-
gradient_clip=None):
31+
gradient_clip=None,
32+
do_model_average=None):
3233
self.name = name
3334
self.initializer = initializer
3435
self.learning_rate = learning_rate
3536
self.regularizer = regularizer
3637
self.trainable = trainable
3738
self.gradient_clip = gradient_clip
39+
self.model_average = do_model_average
3840

3941
def set_default_initializer(self, initializer):
4042
if initializer is None:
@@ -80,7 +82,8 @@ def to_kwargs(self, with_initializer=False):
8082
},
8183
'regularizer': self.regularizer,
8284
'trainable': self.trainable,
83-
'gradient_clip_attr': self.gradient_clip
85+
'gradient_clip_attr': self.gradient_clip,
86+
'model_average': self.model_average
8487
}
8588
if with_initializer:
8689
kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ def to_kwargs(self, with_initializer=False):
9093
class WeightNormParamAttr(ParamAttr):
9194
"""
9295
Used for weight normalization. Any field in ParamAttr can also be set here.
93-
Besides, an extra field dim can be set to indicate the dimension except
96+
Besides, an extra field dim can be set to indicate the dimension except
9497
which to normalize.
9598
"""
9699
# List to record the parameters reparameterized by weight normalization.

0 commit comments

Comments
 (0)