
Commit 6e6b373

CE bug fix (#10999)
* CE bug fix
* Update trainer_callback.py
* CE bug fix
* Update modeling.py

Co-authored-by: zhangbo9674 <[email protected]>
1 parent 7ce548e commit 6e6b373

File tree: 4 files changed, +27 -39 lines changed

paddlenlp/trainer/trainer_callback.py

Lines changed: 7 additions & 3 deletions

@@ -644,7 +644,7 @@ def on_step_begin(self, args, state, control, **kwargs):
         optimizer = kwargs["optimizer"]
         global skip_count

-        if not g_shard_bypass_dygraph_optimizer or skip_count == 0:
+        if (not g_shard_bypass_dygraph_optimizer or skip_count == 0) and hasattr(model, "fp8_quant_weight"):
             model.fp8_quant_weight(True, quant_transpose=False)
             optimizer.clear_param_storage("moe_expert")
             optimizer.clear_param_storage("rms_linear")

@@ -664,6 +664,10 @@ def on_step_begin(self, args, state, control, **kwargs):
         skip_count += 1

     def on_optimizer_begin(self, args, state, control, **kwargs):
+        model = kwargs["model"]
         optimizer = kwargs["optimizer"]
-        for name in self.moe_weights_name:
-            reload(optimizer._master_weights[name])
+        global skip_count
+
+        if (not g_shard_bypass_dygraph_optimizer) and hasattr(model, "fp8_quant_weight"):
+            for name in self.moe_weights_name:
+                reload(optimizer._master_weights[name])
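Both callbacks now apply the same defensive pattern: the FP8 weight-quantization path runs only when the model actually exposes it. A minimal standalone sketch of the guard, assuming nothing beyond the hasattr() check itself (the model classes are hypothetical stand-ins, not PaddleNLP code):

class FP8Model:
    def fp8_quant_weight(self, enable, quant_transpose=False):
        print(f"quantizing: enable={enable}, quant_transpose={quant_transpose}")

class PlainModel:
    pass  # exposes no fp8_quant_weight hook

def maybe_quant(model, bypass_optimizer, skip_count):
    # hasattr() keeps models without the FP8 hook from raising AttributeError
    if (not bypass_optimizer or skip_count == 0) and hasattr(model, "fp8_quant_weight"):
        model.fp8_quant_weight(True, quant_transpose=False)

maybe_quant(FP8Model(), bypass_optimizer=False, skip_count=0)    # runs quantization
maybe_quant(PlainModel(), bypass_optimizer=False, skip_count=0)  # silently skipped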

paddlenlp/transformers/deepseek_v2/modeling.py

Lines changed: 1 addition & 4 deletions

@@ -1538,10 +1538,7 @@ def backward(ctx, dout):
         else:
             assert False, f"invalid {FA_VERSION=}"

-        if FA_VERSION == 2:
-            assert not recompute_fa3
-            assert attn_out is not None and softmax_lse is not None
-        if FA_VERSION == 3 and not recompute_fa3:
+        if (FA_VERSION == 3 and not recompute_fa3) or FA_VERSION == 2:
             assert attn_out is not None and softmax_lse is not None

         q_ln_t, q_ln_invar = fused_ln.fused_rms_norm(q_init, q_ln_weight, eps)
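The two separate assertions collapse into one condition; note the change also drops the standalone `assert not recompute_fa3` that the FA2 branch carried. A quick exhaustive check (standalone code, not part of PaddleNLP) confirms the merged guard fires for the `attn_out`/`softmax_lse` assertion in exactly the same cases as before:

from itertools import product

# Compare the old and new guard conditions over all combinations of
# FA_VERSION in {2, 3} and recompute_fa3 in {False, True}.
for fa_version, recompute_fa3 in product((2, 3), (False, True)):
    old = (fa_version == 2) or (fa_version == 3 and not recompute_fa3)
    new = (fa_version == 3 and not recompute_fa3) or fa_version == 2
    assert old == new, (fa_version, recompute_fa3)
print("merged guard covers the same cases")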

paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 5 additions & 1 deletion

@@ -1118,7 +1118,11 @@ def forward_backward(self, inputs, output_grad, combine_bw_event_to_wait=None, p
         combine_forward_event.current_stream_wait()
         final_out_event.current_stream_wait()

-        inputs = final_out + combine_fwd_out
+        if final_out.shape[-1] != combine_fwd_out.shape[-1]:
+            final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out  # broadcast and add directly
+        else:
+            final_out += combine_fwd_out
+        inputs = final_out

         final_out._record_stream()
         combine_fwd_out._record_stream()
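The fix handles the case where combine_fwd_out is narrower than final_out along the last axis: the add is applied in place to the leading slice and the trailing columns are left untouched. A NumPy sketch of the same shape handling, with the shapes invented purely for illustration:

import numpy as np

final_out = np.zeros((2, 4, 8), dtype=np.float32)       # wider hidden dim
combine_fwd_out = np.ones((2, 4, 6), dtype=np.float32)  # narrower combine output

if final_out.shape[-1] != combine_fwd_out.shape[-1]:
    # add into the leading slice of the last axis only
    final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out
else:
    final_out += combine_fwd_out

inputs = final_out
print(inputs[0, 0])  # [1. 1. 1. 1. 1. 1. 0. 0.]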

paddlenlp/transformers/moe_layer.py

Lines changed: 14 additions & 31 deletions

@@ -904,24 +904,21 @@ def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_probs):
         self.dispatched_indices = dispatched_indices.to(paddle.int32)

         total_zipped_tokens = extract_first_if_tuple(hs_2d_dispatched).shape[0]
-        if DSV3_USE_FP8_DISPATCH:
-            (
-                unzipped_tokens,
-                zipped_expertwise_rowmap,
-                unzipped_probs,
-                unzipped_tokens_scale,
-            ) = self.unzip_node.forward(
-                hs_2d_dispatched,
-                self.dispatched_indices,
-                dispatched_probs,
-                topk=self.router_topk,
-                num_experts=num_experts,
-                tokens_per_expert=self.tokens_per_expert,
-            )
-            record_stream_for_multi_input(hs_2d_dispatched)
-            dispatched_indices._record_stream()
-            dispatched_probs._record_stream()
+        (unzipped_tokens, zipped_expertwise_rowmap, unzipped_probs, unzipped_tokens_scale,) = self.unzip_node.forward(
+            hs_2d_dispatched,
+            self.dispatched_indices,
+            dispatched_probs,
+            topk=self.router_topk,
+            num_experts=num_experts,
+            tokens_per_expert=self.tokens_per_expert,
+        )
+        record_stream_for_multi_input(hs_2d_dispatched)
+        dispatched_indices._record_stream()
+        dispatched_probs._record_stream()
+
+        self.unzipped_probs = unzipped_probs.unsqueeze(-1)

+        if DSV3_USE_FP8_DISPATCH:
             total_unzipped_tokens = extract_first_if_tuple(unzipped_tokens).shape[0]
             # If adaptive O1 recompute is enabled, determine whether to enable recompute O1 based on the degree of imbalance
             if self.recompute_fwd_gate_up == -1:

@@ -935,8 +932,6 @@ def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_probs):
                 # logger.debug(f"recompute_fwd_gate_up changed to False, Because the receives {unzipped_tokens.shape[0]} Tensors less then {self.seq_length*self.num_experts_per_tok*self.adaptive_remained_O1_recompute_ratio}.")
                 self.set_recompute_fwd_gate_up(False)

-        self.unzipped_probs = unzipped_probs.unsqueeze(-1)
-
         # if use_mlp_subbatch is enabled, then split the unzipped_tokens into subbatches
         if self.mlp_fwd_subbatch_rows != 0 and total_unzipped_tokens > self.mlp_fwd_subbatch_rows * 2:
             assert (

@@ -990,18 +985,6 @@ def forward(self, hs_2d_dispatched, dispatched_indices, dispatched_probs):
                 (unzipped_tokens, unzipped_tokens_scale), unzipped_probs, padding_token_per_experts
             )
         else:
-            (unzipped_tokens, zipped_expertwise_rowmap, unzipped_probs, _,) = self.unzip_node.forward(
-                hs_2d_dispatched,
-                self.dispatched_indices,
-                dispatched_probs,
-                topk=self.router_topk,
-                num_experts=num_experts,
-                tokens_per_expert=self.tokens_per_expert,
-            )
-            hs_2d_dispatched._record_stream()
-            dispatched_indices._record_stream()
-            dispatched_probs._record_stream()
-
             # If adaptive O1 recompute is enabled, determine whether to enable recompute O1 based on the degree of imbalance
             if self.recompute_fwd_gate_up == -1:
                 if (
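The moe_layer.py change is a deduplication: the FP8 and non-FP8 branches previously issued the same unzip_node.forward call (the non-FP8 branch simply discarded the scale as `_`) and the same _record_stream bookkeeping, so the commit hoists that shared prefix above the DSV3_USE_FP8_DISPATCH branch and moves the unzipped_probs.unsqueeze(-1) up with it. The refactor pattern in miniature, with hypothetical helpers standing in for the real calls:

def process(x):            # stands in for unzip_node.forward + stream bookkeeping
    return x * 2

def forward_before(x, use_fp8):
    if use_fp8:
        y = process(x)     # duplicated call...
        extra = y + 1      # ...plus fp8-only work
    else:
        y = process(x)     # ...same call repeated
        extra = None
    return y, extra

def forward_after(x, use_fp8):
    y = process(x)         # shared work hoisted out; runs exactly once
    extra = y + 1 if use_fp8 else None
    return y, extra

assert forward_before(3, True) == forward_after(3, True)
assert forward_before(3, False) == forward_after(3, False)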
