Commit 89f5cd8

fix(clip): use double to accumulate grad^2
Global-norm clipping needs to compute the L2 norm of the gradients, which means accumulating sum(grad^2) across all parameters. Accumulating that sum in float32 easily overflows, so accumulate it in float64 instead. test=release/1.0.0
1 parent d23c3ff commit 89f5cd8
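
The failure mode, shown with a minimal numpy-only sketch (illustration only, not Paddle code; the gradient magnitudes below are invented to force the overflow):

import numpy as np

# float32 tops out around 3.4e38, so squaring a value near 2e19 already
# overflows to inf; a float64 accumulator keeps the sum finite.
grad = np.full((1024,), 2e19, dtype=np.float32)

sum_sq_fp32 = np.sum(grad * grad, dtype=np.float32)  # inf: each square is ~4e38
sum_sq_fp64 = np.sum(grad.astype(np.float64) ** 2)   # ~4.1e41, finite in float64

print(sum_sq_fp32)           # inf
print(np.sqrt(sum_sq_fp64))  # ~6.4e20, the correct contribution to the global norm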


python/paddle/fluid/clip.py

Lines changed: 3 additions & 1 deletion
@@ -271,7 +271,8 @@ def _process_context(self, context, param, grad):
                     "All parameters' 'clip_norm' of a same group should be the same"
                 )
 
-        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        square = grad * grad
+        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -281,6 +282,7 @@ def _create_operators(self, param, grad):
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
+            group_norm_var = layers.cast(group_norm_var, 'float32')
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
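
For context, a rough sketch of how this code path is exercised, assuming the fluid 1.0-era clipping API (fluid.clip.GradientClipByGlobalNorm registered via fluid.clip.set_gradient_clip); the toy network is invented for illustration and is not part of this commit:

import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

# Registering a global-norm clipper makes the optimizer call
# _process_context() and _create_operators() for each (param, grad) pair,
# which is where the float64 accumulation patched above takes place.
fluid.clip.set_gradient_clip(fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)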
