
Commit dcb37aa

Authored by iosmers, Xreki, and tianhaodongbd

fix stage2 main_grad acc bug (#59142) (#60030)

* fix stage2 main_grad acc bug
* update code according to review suggestions
* scale in opt
* merge grad scale
* add note
* delete debug info
* keep offload unchanged
* Optimize the BF16 unittest of sharding stage2 and stage3.
* fix stage3 bug
* add fp16 judge
* add init
* add fp16
* fix grad clip
* add if data type is fp16
* change if location
* delete fault arg
* add enum.value

---------

Co-authored-by: Liu Yiqun <[email protected]>
Co-authored-by: tianhaodongbd <[email protected]>

1 parent 92f1fb7 · commit dcb37aa

5 files changed (+273 −283 lines)

paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu

Lines changed: 7 additions & 0 deletions

@@ -134,6 +134,13 @@ void FusedLinearParamGradAdd(const Context &ctx,
 
   bool use_addto = false;
   if (dweight_out) {
+    if (dweight_out->dtype() == phi::DataType::FLOAT16) {
+      LOG_FIRST_N(WARNING, 1)
+          << "fused_linear_param_grad_add op may have problems when "
+             "master_grad is not enabled and use fp16-O2 in stage2, users "
+             "should pay attention to the correctness of the result of the "
+             "grad accumulation in stage2.";
+    }
     if (dweight) {
       use_addto = true;
       *dweight_out = dweight.get();
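The warning exists because fused_linear_param_grad_add accumulates the weight gradient in place (use_addto), and doing that accumulation directly in float16 can silently drop small contributions once the accumulator grows large. A minimal illustration of the rounding effect with plain Paddle tensors and toy values (not the fused kernel itself), assuming any recent Paddle build:

import paddle

# A large fp16 gradient accumulator and a small incoming gradient.
acc_fp16 = paddle.full([4], 1024.0, dtype="float16")
grad = paddle.full([4], 0.25, dtype="float16")

# Accumulating in fp16: at magnitude 1024 the fp16 spacing is 1.0,
# so adding 0.25 rounds back to 1024.0 and the update is lost.
print((acc_fp16 + grad) - acc_fp16)  # -> all zeros

# Accumulating in fp32 (what master_grad provides) keeps the update.
acc_fp32 = acc_fp16.astype("float32")
print((acc_fp32 + grad.astype("float32")) - acc_fp32)  # -> 0.25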

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py

Lines changed: 19 additions & 6 deletions

@@ -194,13 +194,26 @@ def __init__(
             and hcg.get_parallel_mode() is not ParallelMode.DATA_PARALLEL
             and not offload
         ):
-            self._optim._grad_clip = HybridParallelClipGrad(
-                self._optim._grad_clip, hcg
-            )
+            if self.use_main_grad:
+                self._optim._inner_opt._grad_clip = HybridParallelClipGrad(
+                    self._optim._inner_opt._grad_clip, hcg
+                )
+            else:
+                self._optim._grad_clip = HybridParallelClipGrad(
+                    self._optim._grad_clip, hcg
+                )
         else:
-            self._optim._grad_clip = GroupShardedClipGrad(
-                self._optim._grad_clip, paddle.get_device(), self._group
-            )
+            if self.use_main_grad:
+                self._optim._inner_opt._grad_clip = GroupShardedClipGrad(
+                    self._optim._inner_opt._grad_clip,
+                    paddle.get_device(),
+                    self._group,
+                )
+            else:
+                self._optim._grad_clip = GroupShardedClipGrad(
+                    self._optim._grad_clip, paddle.get_device(), self._group
+                )
+
         if self._optim._parameter_list and isinstance(
             self._optim._parameter_list[0], dict
         ):
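This branch now distinguishes whether the optimizer passed in is itself a wrapper: with use_main_grad the grad-clip object that actually runs lives on self._optim._inner_opt, so that is the one that must be replaced by the hybrid/sharded clip wrapper. A schematic sketch of the pattern with stand-in classes (ClipWrapper, InnerOpt, MixedPrecisionOpt and attach_clip are hypothetical names for illustration, not Paddle APIs):

class ClipWrapper:
    """Stand-in for HybridParallelClipGrad / GroupShardedClipGrad."""
    def __init__(self, inner_clip):
        self.inner_clip = inner_clip

class InnerOpt:
    def __init__(self):
        self._grad_clip = "base_clip"

class MixedPrecisionOpt:
    """Stand-in for the wrapper optimizer used when use_main_grad is True."""
    def __init__(self):
        self._inner_opt = InnerOpt()

def attach_clip(optim, use_main_grad):
    # Mirror of the branch above: with main_grad, wrap the clip held by
    # the inner optimizer; otherwise wrap the outer optimizer's clip.
    if use_main_grad:
        optim._inner_opt._grad_clip = ClipWrapper(optim._inner_opt._grad_clip)
    else:
        optim._grad_clip = ClipWrapper(optim._grad_clip)

opt = MixedPrecisionOpt()
attach_clip(opt, use_main_grad=True)
print(type(opt._inner_opt._grad_clip).__name__)  # ClipWrapper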

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py

Lines changed: 32 additions & 15 deletions

@@ -155,6 +155,8 @@ def __init__(
         # Set backward pass hooks
         self._bw_hooks = []
 
+        self.scale_in_opt = False
+
         # TODO (Baibaifan) Set tasks flow support asynchronous communicate
         # self._tasks_flow = deque()
 
@@ -232,13 +234,20 @@ def _clear_gradients(self):
 
     def _grad_scale(self):
         """
-        Before the optimization, scale the gradients before allreduce of dp_group.
+        this function will do 2 things:
+        1. Before the optimization, scale main_grad to support gradient merge if param has main_grad, or to support fused_linear_param_grad_add gradient merge.
+        2. Before the optimization, scale the gradients before allreduce of dp_group.
         """
 
-        if self._dp_group is None or self._dp_group.nranks <= 1:
-            return
+        need_dp_scale = self._dp_group is not None and self._dp_group.nranks > 1
+        if self.scale_in_opt:
+            scale_factor = self._world_size_scaling
         else:
-            scale_factor = 1.0 / (self._dp_group.nranks)
+            scale_factor = 1.0
+
+        if need_dp_scale:
+            dp_scale_factor = 1.0 / (self._dp_group.nranks)
+            scale_factor = scale_factor * dp_scale_factor
 
         # Scale grad storages
         for dtype in self._grad_storages.keys():
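The rewritten _grad_scale folds two scalings into a single factor: the 1/world_size sharding average, applied here only when the backward hooks deferred it (scale_in_opt), and the 1/dp_nranks average for the outer data-parallel group. A pure-Python restatement of that arithmetic (combined_scale_factor and its arguments are illustrative names, not part of Paddle):

def combined_scale_factor(scale_in_opt, world_size, dp_nranks):
    """Return the single factor applied to grads / main_grads in _grad_scale.

    scale_in_opt -- True when the backward hook deferred the 1/world_size
                    scaling to the optimizer (main_grad or fused grad-add).
    dp_nranks    -- size of the outer data-parallel group; None or 1 means
                    no extra allreduce averaging is needed.
    """
    factor = 1.0 / world_size if scale_in_opt else 1.0
    if dp_nranks is not None and dp_nranks > 1:
        factor *= 1.0 / dp_nranks
    return factor

# e.g. sharding world size 8, dp group of 2, scaling deferred to the optimizer:
assert combined_scale_factor(True, 8, 2) == 1.0 / 16
# scaling already done in the backward hook, dp group of 2:
assert combined_scale_factor(False, 8, 2) == 0.5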
@@ -249,7 +258,6 @@ def _grad_scale(self):
                 self._grad_storages[dtype][self._rank].buffer.scale_(
                     scale=scale_factor
                 )
-
         # Scale grads of params
         with paddle.no_grad():
             for param in self._trainable_params:
@@ -258,11 +266,14 @@ def _grad_scale(self):
                     param.main_grad.scale_(scale=scale_factor)
                 elif param.grad is not None:
                     param.grad.scale_(scale=scale_factor)
-                    # param._reset_grad_inplace_version(True)
 
-                # Scale grads of master params with offload strategy
+        # Scale grads of master params with offload strategy
         if self._offload:
-            self._sharding_optimizers[0]._offload_scale_grad(scale_factor)
+            if need_dp_scale is False:
+                return
+            self._sharding_optimizers[0]._offload_scale_grad(
+                scale=1.0 / (self._dp_group.nranks)
+            )
 
     def _init_internal_storage(self, needs_fresh):
         """
@@ -379,15 +390,21 @@ def _set_reduce_overlap(self, reduce_overlap):
     def _get_scaled_grad_fn(self, param):
         @paddle.autograd.no_grad()
         def scale(grad):
-            if hasattr(param, "main_grad"):
-                param.main_grad.scale_(self._world_size_scaling)
-            else:
-                if grad is not None and grad._is_initialized():
+            # do gradient scale separately
+            # For grad scale, we need to do it in the backward hook due to fp16 may overflow if we first add grad and then scale
+            # For main_grad scale and fused_linear_param_grad_add, we do scale in the optimizer.
+            if not self.scale_in_opt:
+                if (
+                    not hasattr(param, "main_grad")
+                    and grad is not None
+                    and grad.dtype == Type.fp16.value
+                ):
+                    assert (
+                        grad._is_initialized()
+                    ), "grad should be initialized in stage2"
                     grad.scale_(self._world_size_scaling)
                 else:
-                    assert param.grad is not None
-                    assert param.grad._is_initialized()
-                    param.grad.scale_(self._world_size_scaling)
+                    self.scale_in_opt = True
 
         return scale
 
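The rewritten hook makes the scaling decision per parameter: a raw fp16 grad without a main_grad must be scaled immediately in the backward hook, because first summing unscaled fp16 grads and scaling afterwards risks overflow; every other case just flips scale_in_opt so that _grad_scale applies the 1/world_size factor later, at optimizer time. A stand-alone restatement of that decision (decide_scale_in_hook and the SimpleNamespace stand-ins are illustrative, not Paddle code):

from types import SimpleNamespace

FP16 = "float16"

def decide_scale_in_hook(param, grad):
    """Return True if the grad must be scaled inside the backward hook."""
    # Matches the condition in _get_scaled_grad_fn: only raw fp16 grads
    # without a main_grad are scaled eagerly; all other cases defer the
    # 1/world_size scaling to the optimizer step (scale_in_opt = True).
    return (
        not hasattr(param, "main_grad")
        and grad is not None
        and grad.dtype == FP16
    )

p_fp16 = SimpleNamespace()                    # plain fp16 param
p_main = SimpleNamespace(main_grad=object())  # param with an fp32 main_grad
g = SimpleNamespace(dtype=FP16)

assert decide_scale_in_hook(p_fp16, g) is True   # scale now, in the hook
assert decide_scale_in_hook(p_main, g) is False  # scale later, in _grad_scale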
