@@ -77,6 +77,7 @@
 global_inputs_embeds_mtp_queue = queue.Queue()
 
 
+global norm_out
 
 DSV3_USE_FP8_GEMM = os.getenv("DSV3_USE_FP8_GEMM", "False").lower() == "true"
 DSV3_USE_FP8_DISPATCH = os.getenv("DSV3_USE_FP8_DISPATCH", "False").lower() == "true"
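
The patch moves `norm_out` between pipeline stages through module-level state instead of threading it through the tuple each stage hands to the next. Below is a minimal sketch of that hand-off pattern, with hypothetical function names (not the PaddleNLP code); note that at module scope the bare `global norm_out` statement is only a declaration of intent, since the name is actually created the first time a function assigns it.

```python
# Minimal sketch of the module-global hand-off pattern (hypothetical names).

norm_out = None  # created eagerly here; the patch relies on first assignment instead


def producer_stage(hidden_states):
    """Computes the post-norm activations and stashes them out of band."""
    global norm_out  # required here: this function assigns the module global
    norm_out = hidden_states * 0.5  # stand-in for the real router/norm computation
    return hidden_states


def consumer_stage():
    """Runs later in the same process and reuses the stashed tensor."""
    # A pure read resolves to module scope without a `global` statement.
    return norm_out
```

This keeps the inter-stage tuple at a fixed arity, at the cost of an ordering assumption: the producer must run before the consumer in the same process, and overlapping micro-batches would overwrite the single slot.
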
@@ -183,12 +184,16 @@ def forward_without_residual(self, inputs):
     with paddle.no_grad():
         if self.shared_experts is not None:
             if self.using_post_norm_recompute:
-                shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd_norm_rc(
-                    hidden_states,
-                    self.shared_experts.norm_weight,
-                    self.shared_experts.norm_eps,
-                    self.shared_experts.w1,
-                    self.shared_experts.w2,
+                global norm_out
+                # shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd_norm_rc(
+                #     hidden_states,
+                #     self.shared_experts.norm_weight,
+                #     self.shared_experts.norm_eps,
+                #     self.shared_experts.w1,
+                #     self.shared_experts.w2,
+                # )
+                _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
+                    norm_out, self.shared_experts.w1, self.shared_experts.w2
                 )
             else:
                 _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
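
The removed path fused the RMSNorm into the FP8 shared-expert kernel; the added path reuses the `norm_out` already produced upstream and calls the plain MLP entry point with the signature visible in the diff. A hedged sketch of the relationship, assuming `fp8_mlp_fwd_norm_rc` is approximately "RMSNorm, then `fp8_mlp_fwd`" (the norm below is an illustrative stand-in, not the kernel):

```python
import paddle


def rms_norm_sketch(x, weight, eps):
    # Illustrative stand-in for the norm that fp8_mlp_fwd_norm_rc fused in.
    variance = x.astype("float32").pow(2).mean(axis=-1, keepdim=True)
    return (x.astype("float32") * paddle.rsqrt(variance + eps)).astype(x.dtype) * weight


# Old (removed): shared_out = fp8_mlp_fwd_norm_rc(hidden_states, norm_w, eps, w1, w2)
# New (added):   _, _, shared_out = fp8_mlp_fwd(norm_out, w1, w2)
# where norm_out ~ rms_norm_sketch(hidden_states, norm_w, eps) was computed once
# by the router stage, so this branch no longer re-normalizes hidden_states.
```
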
@@ -566,17 +571,12 @@ def attn_forward(self, inputs):
     inputs = self.attn_and_gate_node.forward(inputs)
 
     if self.send_mtp_embed:
-        if self.using_post_norm_recompute:
-            inputs_embeds_mtp, hidden_states, residual, probs, routing_map, l_aux, norm_out = inputs
-        else:
-            inputs_embeds_mtp, hidden_states, residual, probs, routing_map, l_aux = inputs
+        inputs_embeds_mtp, hidden_states, residual, probs, routing_map, l_aux = inputs
     else:
-        if self.using_post_norm_recompute:
-            hidden_states, residual, probs, routing_map, l_aux, norm_out = inputs
-        else:
-            hidden_states, residual, probs, routing_map, l_aux = inputs
+        hidden_states, residual, probs, routing_map, l_aux = inputs
 
     if self.using_post_norm_recompute:
+        global norm_out
         hs_2d, token_indices, token_probs = self.fp8_fusion_moe_node.dispatch_quant_node.forward(
             norm_out, probs, routing_map
         )
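
With `norm_out` out of band, the unpack in `attn_forward` has a single arity per branch. A condensed sketch of the resulting flow (dispatch details omitted; only the recompute branch is shown feeding the dispatcher):

```python
def attn_forward_sketch(self, inputs):
    # Tuple arity now depends only on send_mtp_embed, not on the norm mode.
    if self.send_mtp_embed:
        inputs_embeds_mtp, hidden_states, residual, probs, routing_map, l_aux = inputs
    else:
        hidden_states, residual, probs, routing_map, l_aux = inputs

    if self.using_post_norm_recompute:
        # The patch writes `global norm_out` before this read; for a pure read
        # that only documents the cross-stage dependency, since a bare read
        # already resolves to module scope.
        return norm_out, probs, routing_map  # what the dispatcher consumes
```
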
@@ -1222,6 +1222,7 @@ def attn_compute_for_fusion(self, args):
     _, _, d_model = hidden_states.shape
 
     if self.using_post_norm_recompute:
+        global norm_out
         probs, routing_map, l_aux, _, norm_out = self.mlp.router(hidden_states)
     else:
         probs, routing_map, l_aux, _ = self.mlp.router(hidden_states)
@@ -1236,8 +1237,6 @@ def attn_compute_for_fusion(self, args):
     )
 
     # append mtp embed if needed
     ret = (inputs_embeds_mtp, *ret) if send_mtp_embed else ret
-    # append norm_out if using post_norm recompute
-    ret = (*ret, norm_out) if self.using_post_norm_recompute else ret
 
     return ret
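
On the producer side, the router stage now assigns the module-level `norm_out` instead of appending it to `ret`, so every consumer sees a fixed-shape tuple. A condensed sketch with an illustrative payload (not the full return value):

```python
def attn_compute_for_fusion_sketch(self, hidden_states, inputs_embeds_mtp, send_mtp_embed):
    global norm_out  # the assignment below targets the module-level slot
    if self.using_post_norm_recompute:
        probs, routing_map, l_aux, _, norm_out = self.mlp.router(hidden_states)
    else:
        probs, routing_map, l_aux, _ = self.mlp.router(hidden_states)

    ret = (hidden_states, probs, routing_map, l_aux)  # illustrative payload
    # The mtp embed is still threaded through the tuple; norm_out no longer is.
    ret = (inputs_embeds_mtp, *ret) if send_mtp_embed else ret
    return ret
```
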