@@ -192,6 +192,7 @@ def forward_without_residual(self, inputs):

         if self.send_mtp_embed:
             hidden_states = paddle.concat([hidden_states, inputs_embeds_mtp], axis=-1)
+            self.mtp_embed_shape = inputs_embeds_mtp.shape  # save the mtp_embed shape for the backward pass

         return return_args(hidden_states)

@@ -227,37 +228,47 @@ def forward(self, inputs):

         if self.send_mtp_embed:
             hidden_states = paddle.concat([hidden_states, inputs_embeds_mtp], axis=-1)
+            self.mtp_embed_shape = inputs_embeds_mtp.shape  # save the mtp_embed shape for the backward pass

         return return_args(hidden_states)

     @paddle.no_grad()
     def backward(self, output_grad):
         (do3,) = output_grad

-        assert not self.send_mtp_embed, "not support have mtp have yet"
+        if self.send_mtp_embed:
+            # split the gradient: the leading part of do3 corresponds to hidden_states, the trailing part to inputs_embeds_mtp
+            hidden_size = do3.shape[-1] - self.mtp_embed_shape[-1]
+            hidden_states_grad = do3[..., :hidden_size]
+            inputs_embeds_mtp_grad = do3[..., hidden_size:]
+        else:
+            hidden_states_grad = do3
+            inputs_embeds_mtp_grad = None
+
         if self.using_post_norm_recompute:
             dx = FP8LinearFunctionBase.fp8_mlp_bwd_norm_rc(
-                do3,
+                hidden_states_grad,
                 self.x,
                 self.shared_experts.norm_weight,
                 self.shared_experts.norm_eps,
                 self.shared_experts.w1,
                 self.shared_experts.w2,
             )
         else:
-            dx = FP8LinearFunctionBase.fp8_mlp_bwd(do3, self.x, self.shared_experts.w1, self.shared_experts.w2)
+            dx = FP8LinearFunctionBase.fp8_mlp_bwd(
+                hidden_states_grad, self.x, self.shared_experts.w1, self.shared_experts.w2, True
+            )

         self.x = None

-        residual_grad = do3
-
-        hidden_states_grad = dx
-
+        residual_grad = hidden_states_grad

         l_aux_grad = paddle.ones(1, dtype=self.l_aux.dtype) * self.alpha
+        final_hidden_states_grad = hidden_states_grad

-        final_hidden_states_grad = do3
-
-        return (hidden_states_grad, residual_grad, l_aux_grad, final_hidden_states_grad)
+        if self.send_mtp_embed:
+            return (inputs_embeds_mtp_grad, dx, residual_grad, l_aux_grad, final_hidden_states_grad)
+        else:
+            return (dx, residual_grad, l_aux_grad, final_hidden_states_grad)


 class DecoderLayerNode(ScheduleNode):
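For context on the `backward` change above: because the forward pass concatenates `hidden_states` and `inputs_embeds_mtp` along the last axis, the incoming gradient can simply be split at the saved width to recover each input's gradient. A minimal standalone sketch of that idea (illustrative shapes and names, not the PR's tensors):

```python
import paddle

# Stand-ins for hidden_states and inputs_embeds_mtp; sizes are made up.
hidden = paddle.randn([2, 4, 8])
mtp_embed = paddle.randn([2, 4, 8])
hidden.stop_gradient = False
mtp_embed.stop_gradient = False

out = paddle.concat([hidden, mtp_embed], axis=-1)  # forward: concat on the last axis
do3 = paddle.ones_like(out)                        # pretend upstream gradient

# Manual split, mirroring the new send_mtp_embed branch:
hidden_size = do3.shape[-1] - mtp_embed.shape[-1]
hidden_states_grad = do3[..., :hidden_size]
inputs_embeds_mtp_grad = do3[..., hidden_size:]

# Autograd agrees: concat's backward routes each slice back to its input.
out.backward(do3)
assert paddle.allclose(hidden.grad, hidden_states_grad)
assert paddle.allclose(mtp_embed.grad, inputs_embeds_mtp_grad)
```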
@@ -749,6 +760,9 @@ def attn_backward(self, output_grad):
                 hs_grad,
                 token_probs_grad,
             ) = output_grad
+            inputs_embeds_mtp_grad_shape = hidden_states_grad.shape
+            inputs_embeds_mtp_grad_shape[-1] = -1
+            inputs_embeds_mtp_grad = inputs_embeds_mtp_grad.view(inputs_embeds_mtp_grad_shape)
         else:
             hidden_states_grad, residual_grad, l_aux_grad, hs_grad, token_probs_grad = output_grad
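The three added lines reshape the MTP gradient so its leading dimensions match `hidden_states_grad`, leaving the last dimension to be inferred via `-1`. A small sketch with made-up shapes (in Paddle, `Tensor.shape` returns a plain Python list, so it can be edited before the `view`):

```python
import paddle

# Illustrative: hidden grad is [batch, seq, hidden]; the MTP grad arrives flattened.
hidden_states_grad = paddle.randn([2, 4, 8])
inputs_embeds_mtp_grad = paddle.randn([8, 16])   # pretend flattened layout

target_shape = hidden_states_grad.shape          # e.g. [2, 4, 8]
target_shape[-1] = -1                            # infer the last dimension
inputs_embeds_mtp_grad = inputs_embeds_mtp_grad.view(target_shape)

print(inputs_embeds_mtp_grad.shape)              # [2, 4, 16] -- leading dims now match
```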
@@ -906,8 +920,11 @@ def forward_backward(self, inputs, output_grad, combine_bw_event_to_wait=None, p
        combine_forward_event.calc_stream_wait(self.forward_node.moe_group.id)

        final_out = self.forward_node.post_process_node.forward_without_residual(inputs)
-        inputs = final_out + combine_fwd_out
-
+        if final_out.shape[-1] != combine_fwd_out.shape[-1]:
+            final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out  # broadcast and add directly
+        else:
+            final_out += combine_fwd_out
+        inputs = final_out
        combine_fwd_out._record_stream()

        paddle.base.core.nvprof_nvtx_pop()
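The new branch covers the case where `final_out` carries extra MTP channels appended on the last axis: `combine_fwd_out` is added only into the leading hidden-state slice, and the MTP tail is left untouched. A toy sketch with invented sizes:

```python
import paddle

hidden = 8
mtp_extra = 8   # extra channels appended when send_mtp_embed is on (illustrative)

final_out = paddle.zeros([2, 4, hidden + mtp_extra])
combine_fwd_out = paddle.ones([2, 4, hidden])

if final_out.shape[-1] != combine_fwd_out.shape[-1]:
    # add only into the leading hidden-state channels
    final_out[:, :, : combine_fwd_out.shape[-1]] += combine_fwd_out
else:
    final_out += combine_fwd_out

print(final_out[..., :hidden].sum())   # 64.0 -- the hidden slice received the add
print(final_out[..., hidden:].sum())   # 0.0  -- the MTP slice is unchanged
```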
@@ -1072,7 +1089,7 @@ def forward(self, args):
         if self.config.send_mtp_embed:
             batch_size, _, hidden_size = hidden_states.shape
             batch_size_mtp = hidden_size // (self.config.num_nextn_predict_layers + 1)
-            inputs_embeds_mtp = hidden_states[..., -batch_size_mtp:]
+            inputs_embeds_mtp = hidden_states[..., batch_size_mtp:]
             hidden_states = hidden_states[..., :batch_size_mtp]

         has_gradient = not hidden_states.stop_gradient
@@ -1129,7 +1146,7 @@ def attn_compute(self, args):

             batch_size, _, hidden_size = hidden_states.shape
             batch_size_mtp = hidden_size // (self.config.num_nextn_predict_layers + 1)
-            inputs_embeds_mtp = hidden_states[..., -batch_size_mtp:]
+            inputs_embeds_mtp = hidden_states[..., batch_size_mtp:]
             hidden_states = hidden_states[..., :batch_size_mtp]

         def attn_compute_func(hidden_states):
@@ -1162,7 +1179,7 @@ def attn_compute_for_fusion(self, args):
             # slice from holy tensor
             batch_size, _, hidden_size = hidden_states.shape
             batch_size_mtp = hidden_size // (self.config.num_nextn_predict_layers + 1)
-            inputs_embeds_mtp = hidden_states[..., -batch_size_mtp:]
+            inputs_embeds_mtp = hidden_states[..., batch_size_mtp:]
             hidden_states = hidden_states[..., :batch_size_mtp]

         hidden_states, residual = self.self_attn_compute(hidden_states)
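The slice change repeated in the last three hunks matters once more than one next-token-prediction layer is configured: `[..., batch_size_mtp:]` keeps every appended MTP embedding, while the old `[..., -batch_size_mtp:]` kept only the last one; the two coincide only when `num_nextn_predict_layers == 1`. A toy illustration with invented sizes:

```python
import paddle

num_nextn_predict_layers = 2
base = 4                                             # per-part width, illustrative
hidden_size = base * (num_nextn_predict_layers + 1)  # hidden + 2 MTP embeddings
hidden_states = paddle.arange(2 * 3 * hidden_size, dtype="float32").reshape([2, 3, hidden_size])

batch_size_mtp = hidden_size // (num_nextn_predict_layers + 1)  # == base

new_slice = hidden_states[..., batch_size_mtp:]    # all MTP embeddings, width 8
old_slice = hidden_states[..., -batch_size_mtp:]   # only the last MTP embedding, width 4
core = hidden_states[..., :batch_size_mtp]         # the actual hidden states, width 4

print(new_slice.shape, old_slice.shape, core.shape)  # [2, 3, 8] [2, 3, 4] [2, 3, 4]
```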