
Commit 6a3fb15

not recomput o1 (#10953)
* not recomput o1
* not recomput o1
* not recomput o1
* not recomput o1
* not recomput o1
* fix
1 parent e91f55a commit 6a3fb15

File tree: 2 files changed, +105 -40 lines

paddlenlp/transformers/deepseek_v2/modeling_pp.py
paddlenlp/transformers/fp8_utils.py
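What the commit does: `FP8LinearFunctionBase.fp8_mlp_fwd` now also returns the first GEMM output `o1`, the MLP autograd function saves it for backward, and `common_fp8_mlp_bwd` accepts an optional `o1` so the backward pass can skip the `deep_gemm` recompute whenever a cached value is available. Below is a minimal, framework-free sketch of that cache-instead-of-recompute pattern; all names are illustrative stand-ins (a plain ReLU MLP in NumPy), not PaddleNLP APIs.

import numpy as np

class CachedMLP:
    """Toy stand-in for an MLP that caches its first matmul output `o1`
    in forward so backward can reuse it instead of recomputing the GEMM."""

    def __init__(self, w1, w2):
        self.w1, self.w2 = w1, w2

    def forward(self, x):
        o1 = x @ self.w1              # first GEMM; the tensor this commit stops recomputing
        o2 = np.maximum(o1, 0.0)      # ReLU stand-in for the real swiglu activation
        o3 = o2 @ self.w2
        self.saved = (x, o1)          # cache o1 alongside the input (costs activation memory)
        return o3

    def backward(self, do3):
        x, o1 = self.saved            # reuse the cached o1: no extra GEMM here
        o2 = np.maximum(o1, 0.0)
        do2 = do3 @ self.w2.T
        do1 = do2 * (o1 > 0.0)
        dx = do1 @ self.w1.T
        dw1 = x.T @ do1
        dw2 = o2.T @ do3
        return dx, dw1, dw2

mlp = CachedMLP(np.random.randn(16, 64), np.random.randn(64, 16))
out = mlp.forward(np.random.randn(8, 16))
dx, dw1, dw2 = mlp.backward(np.ones_like(out))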

paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 49 additions & 13 deletions
@@ -49,8 +49,8 @@
     DeepseekV2PretrainedModel,
     DeepseekV2PretrainingCriterion,
     DeepseekV2RMSNorm,
-    set_global_step,
     TemporaryVarContext,
+    set_global_step,
 )
 
 try:
@@ -187,13 +187,13 @@ def forward_without_residual(self, inputs):
         with paddle.no_grad():
             if self.shared_experts is not None:
                 if self.using_post_norm_recompute:
-                    _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
+                    _, _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
                         norm_out, self.shared_experts.w1, self.shared_experts.w2
                     )
                     norm_out = None
                     del norm_out
                 else:
-                    _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
+                    _, _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
                         hidden_states, self.shared_experts.w1, self.shared_experts.w2
                     )
                 residual = residual + shared_expert_output
@@ -229,13 +229,13 @@ def forward(self, inputs):
         with paddle.no_grad():
             if self.shared_experts is not None:
                 if self.using_post_norm_recompute:
-                    _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
+                    _, _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
                         norm_out, self.shared_experts.w1, self.shared_experts.w2
                     )
                     norm_out = None
                     del norm_out
                 else:
-                    _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
+                    _, _, _, shared_expert_output = FP8LinearFunctionBase.fp8_mlp_fwd(
                         hidden_states, self.shared_experts.w1, self.shared_experts.w2
                     )
                 final_hidden_states = final_hidden_states + shared_expert_output
@@ -282,10 +282,18 @@ def backward(self, output_grad):
         residual_grad = hidden_states_grad
         l_aux_grad = paddle.ones(1, dtype=self.l_aux.dtype) * self.alpha
         final_hidden_states_grad = hidden_states_grad
-
+
         if self.using_post_norm_recompute:
             if self.send_mtp_embed:
-                return (inputs_embeds_mtp_grad, dx, residual_grad, l_aux_grad, final_hidden_states_grad, norm_out, invar)
+                return (
+                    inputs_embeds_mtp_grad,
+                    dx,
+                    residual_grad,
+                    l_aux_grad,
+                    final_hidden_states_grad,
+                    norm_out,
+                    invar,
+                )
             else:
                 return (dx, residual_grad, l_aux_grad, final_hidden_states_grad, norm_out, invar)
         else:
@@ -724,7 +732,6 @@ def post_process_forward(self, inputs, with_residual=True):
         inputs = (inputs_embeds_mtp, *inputs) if self.send_mtp_embed else inputs
         inputs = (*inputs, norm_out) if self.using_post_norm_recompute else inputs
 
-
         if with_residual:
             inputs = self.post_process_node.forward(inputs)
         else:
@@ -736,7 +743,15 @@ def post_process_backward(self, output_grad, event_to_wait=None):
 
         if self.using_post_norm_recompute:
             if self.send_mtp_embed:
-                inputs_embeds_mtp_grad, hidden_states_grad, residual_grad, l_aux_grad, final_hidden_states_grad, norm_out, invar = grad
+                (
+                    inputs_embeds_mtp_grad,
+                    hidden_states_grad,
+                    residual_grad,
+                    l_aux_grad,
+                    final_hidden_states_grad,
+                    norm_out,
+                    invar,
+                ) = grad
             else:
                 hidden_states_grad, residual_grad, l_aux_grad, final_hidden_states_grad, norm_out, invar = grad
         else:
@@ -815,17 +830,30 @@ def combine_backward(self, output_grad, previous_event=None, async_finish=False,
     def mlp_backward(self, output_grad):
         if self.using_post_norm_recompute:
             if self.send_mtp_embed:
-                inputs_embeds_mtp_grad, hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad, norm_out, invar = output_grad
+                (
+                    inputs_embeds_mtp_grad,
+                    hidden_states_grad,
+                    residual_grad,
+                    l_aux_grad,
+                    hidden_states_out_grad,
+                    norm_out,
+                    invar,
+                ) = output_grad
             else:
                 hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad, norm_out, invar = output_grad
         else:
             if self.send_mtp_embed:
-                inputs_embeds_mtp_grad, hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad = output_grad
+                (
+                    inputs_embeds_mtp_grad,
+                    hidden_states_grad,
+                    residual_grad,
+                    l_aux_grad,
+                    hidden_states_out_grad,
+                ) = output_grad
             else:
                 hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad = output_grad
         hs_dispatched_grad, dispatched_probs_grad = self.fp8_fusion_moe_node.mlp_node.backward(hidden_states_out_grad)
 
-
         ret = (hidden_states_grad, residual_grad, l_aux_grad, hs_dispatched_grad, dispatched_probs_grad)
         ret = (inputs_embeds_mtp_grad, *ret) if self.send_mtp_embed else ret
         ret = (*ret, norm_out, invar) if self.using_post_norm_recompute else ret
@@ -845,7 +873,15 @@ def dispatch_backward(self, output_grad, async_finish=False, previous_event=None
                     invar,
                 ) = output_grad
             else:
-                hidden_states_grad, residual_grad, l_aux_grad, hs_dispatched_grad, dispatched_probs_grad, norm_out, invar = output_grad
+                (
+                    hidden_states_grad,
+                    residual_grad,
+                    l_aux_grad,
+                    hs_dispatched_grad,
+                    dispatched_probs_grad,
+                    norm_out,
+                    invar,
+                ) = output_grad
         else:
             if self.send_mtp_embed:
                 (

paddlenlp/transformers/fp8_utils.py

Lines changed: 56 additions & 27 deletions
@@ -1,16 +1,17 @@
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
+
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+
 # http://www.apache.org/licenses/LICENSE-2.0
-#
+
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import os
 from functools import partial
 
@@ -301,16 +302,22 @@ def compute_expert_w_grad(
         return result
 
     @staticmethod
-    def common_fp8_mlp_bwd(do3, x_fp8, x_scale, x_t_fp8, x_t_scale, w1, w2, apply_backward_hook=False):
+    def common_fp8_mlp_bwd(
+        do3, x_t_fp8, x_t_scale, w1, w2, o1=None, x_fp8=None, x_scale=None, apply_backward_hook=False
+    ):
+        if o1 is not None and (x_fp8 is not None or x_scale is not None):
+            raise ValueError("When o1 is provided, both x_fp8 and x_scale must be None.")
 
-        # # ===== [recompute] o1 = deep_gemm(x_fp8, w1_t_fp8) =====
-        # o1, x_t_fp8, x_t_scale = FP8LinearFunctionBase.compute_fp8_linear(
-        #     x, w1, weight_transpose=True, return_transpose_only=True, return_mode="with_input_transpose_quant"
-        # )
+        if o1 is None:
+            if x_fp8 is None or x_scale is None:
+                raise ValueError("When o1 is None, both x_fp8 and x_scale must be provided.")
 
-        w1_fp8, w1_scale = weight_quant(w1, True)
-        o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=do3.dtype)
-        deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1, num_sms=118)
+            # # ===== [recompute] o1 = deep_gemm(x_fp8, w1_t_fp8) =====
+
+            # Recompute o1 using deep_gemm(x_fp8, w1_t_fp8)
+            w1_fp8, w1_scale = weight_quant(w1, True)
+            o1 = paddle.empty([x_fp8.shape[0], w1_fp8.shape[0]], dtype=do3.dtype)
+            deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale.T), (w1_fp8, w1_scale), o1, num_sms=118)
 
         # ===== [recompute] o2 = swiglu(o1) =====
         o2 = swiglu(o1)
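With the new keyword layout, `common_fp8_mlp_bwd` can be driven in two mutually exclusive ways: pass the cached `o1`, or pass `x_fp8`/`x_scale` so `o1` is recomputed via `deep_gemm`. A tiny self-contained sketch of that argument contract follows; the `resolve_o1` name and the placeholder return value are hypothetical, purely for illustration.

def resolve_o1(o1=None, x_fp8=None, x_scale=None):
    # Mirrors the guard added to common_fp8_mlp_bwd in the hunk above.
    if o1 is not None and (x_fp8 is not None or x_scale is not None):
        raise ValueError("When o1 is provided, both x_fp8 and x_scale must be None.")
    if o1 is None:
        if x_fp8 is None or x_scale is None:
            raise ValueError("When o1 is None, both x_fp8 and x_scale must be provided.")
        o1 = ("recomputed from", x_fp8, x_scale)  # placeholder for the deep_gemm recompute path
    return o1

print(resolve_o1(o1="cached_o1"))                    # fast path: reuse the saved activation
print(resolve_o1(x_fp8="x_fp8", x_scale="x_scale"))  # fallback: recompute o1 from the quantized input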
@@ -409,7 +416,15 @@ def fp8_mlp_fwd(x, w1, w2):
         if len(x_orig_shape) > 2:
             o3 = o3.reshape([x_orig_shape[0], -1, o3.shape[-1]])
 
-        return x_fp8, x_scale, o3
+        return o1, x_fp8, x_scale, o3
+
+    @staticmethod
+    def fp8_mlp_fwd_norm_rc(x, norm_w, norm_eps, w1, w2):
+        # ===== compute norm_output =====
+        norm_output, _ = fused_ln.fused_rms_norm(x, norm_w, norm_eps)
+        # ===== compute fp8_mlp_fwd =====
+        _, _, _, o3 = FP8LinearFunctionBase.fp8_mlp_fwd(norm_output, w1, w2)
+        return o3
 
     @staticmethod
     def fp8_mlp_bwd(do3, x, w1, w2, apply_backward_hook=False):
@@ -423,14 +438,30 @@ def fp8_mlp_bwd(do3, x, w1, w2, apply_backward_hook=False):
 
         if apply_backward_hook:
             dx = FP8LinearFunctionBase.common_fp8_mlp_bwd(
-                do3, x_fp8, x_scale, x_t_fp8, x_t_scale, w1, w2, apply_backward_hook=apply_backward_hook
+                do3,
+                x_t_fp8,
+                x_t_scale,
+                w1,
+                w2,
+                o1=None,
+                x_fp8=x_fp8,
+                x_scale=x_scale,
+                apply_backward_hook=apply_backward_hook,
             )
             if len(x_orig_shape) > 2:
                 dx = dx.reshape([x_orig_shape[0], -1, dx.shape[-1]])
             return dx
         else:
             dx, dw1, dw2 = FP8LinearFunctionBase.common_fp8_mlp_bwd(
-                do3, x_fp8, x_scale, x_t_fp8, x_t_scale, w1, w2, apply_backward_hook=apply_backward_hook
+                do3,
+                x_t_fp8,
+                x_t_scale,
+                w1,
+                w2,
+                o1=None,
+                x_fp8=x_fp8,
+                x_scale=x_scale,
+                apply_backward_hook=apply_backward_hook,
             )
             if len(x_orig_shape) > 2:
                 dx = dx.reshape([x_orig_shape[0], -1, dx.shape[-1]])
@@ -580,14 +611,16 @@ def forward(ctx, x, norm_w, w1, w2, norm_eps):
             norm_output = norm_output.reshape([-1, x_orig_shape[-1]])
 
         # ===== call func fp8_mlp_fwd =====
-        _, _, o3 = FP8LinearFunctionBase.fp8_mlp_fwd(norm_output, w1, w2)
+        _, _, _, o3 = FP8LinearFunctionBase.fp8_mlp_fwd(norm_output, w1, w2)
 
         # ===== reshape to origin shape =====
         if len(x_orig_shape) > 2:
             o3 = o3.reshape([x_orig_shape[0], -1, o3.shape[-1]])
 
         # ===== save for backward =====
         ctx.save_for_backward(
+            norm_output,
+            invar,
             x,
             norm_w,
             w1,
@@ -604,21 +637,15 @@ def backward(ctx, do3):
             do3 = do3.reshape([-1, do3_orig_shape[-1]])
 
         # ===== recive saved tensors =====
-        x, norm_w, w1, w2, norm_eps, x_orig_shape = ctx.saved_tensor()
-
-        # ===== recompute norm =====
-        norm_output, invar = fused_ln.fused_rms_norm(x, norm_w, norm_eps)
-
-        # ===== compute x_t_fp8, x_t_scale for dw1 =====
-        norm_output = norm_output.reshape([-1, x_orig_shape[-1]])
+        norm_output, invar, x, norm_w, w1, w2, norm_eps, x_orig_shape = ctx.saved_tensor()
 
         x_fp8, x_scale, x_t_fp8, x_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
             norm_output, output_scale_transpose=True, quant_method="1x128", input_transpose=True
         )
 
         # ===== call func common_fp8_mlp_bwd =====
         d_norm_output, dw1, dw2 = FP8LinearFunctionBase.common_fp8_mlp_bwd(
-            do3, x_fp8, x_scale, x_t_fp8, x_t_scale, w1, w2
+            do3, x_t_fp8, x_t_scale, w1, w2, o1=None, x_fp8=x_fp8, x_scale=x_scale
         )
 
         # ===== reshape to origin shape =====
@@ -639,13 +666,14 @@ def forward(ctx, x, w1, w2):
             x = x.reshape([-1, x_orig_shape[-1]])
 
         # ===== call func fp8_mlp_fwd =====
-        x_fp8, x_scale, o3 = FP8LinearFunctionBase.fp8_mlp_fwd(x, w1, w2)
+        o1, x_fp8, x_scale, o3 = FP8LinearFunctionBase.fp8_mlp_fwd(x, w1, w2)
         # ===== reshape to origin shape =====
         if len(x_orig_shape) > 2:
             o3 = o3.reshape([x_orig_shape[0], -1, o3.shape[-1]])
 
         # ===== save for backward =====
         ctx.save_for_backward(
+            o1,
             x_fp8,
             x_scale,
             w1,
@@ -661,7 +689,7 @@ def backward(ctx, do3):
             do3 = do3.reshape([-1, do3_orig_shape[-1]])
 
         # ===== recive saved tensors =====
-        x_fp8, x_scale, w1, w2, x_orig_shape = ctx.saved_tensor()
+        o1, x_fp8, x_scale, w1, w2, x_orig_shape = ctx.saved_tensor()
 
         # ===== compute x_t_fp8, x_t_scale for dw1 =====
         x_dequant_fp16 = paddle.incubate.nn.functional.fused_act_dequant(x_fp8, x_scale.T.contiguous())
@@ -676,8 +704,9 @@ def backward(ctx, do3):
         )
 
         # ===== call func common_fp8_mlp_bwd =====
-        dx = FP8LinearFunctionBase.common_fp8_mlp_bwd(do3, x_fp8, x_scale, x_t_fp8, x_t_scale, w1, w2, True)
-
+        dx = FP8LinearFunctionBase.common_fp8_mlp_bwd(
+            do3, x_t_fp8, x_t_scale, w1, w2, o1=o1, x_fp8=None, x_scale=None, apply_backward_hook=True
+        )
         # ===== reshape to origin shape =====
         if len(x_orig_shape) > 2:
             dx = dx.reshape([x_orig_shape[0], -1, dx.shape[-1]])
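Keeping `o1` alive via `ctx.save_for_backward` trades activation memory for the `deep_gemm` call the backward no longer issues. A rough back-of-envelope sketch is below; the shapes are illustrative assumptions, not values taken from this PR, and real sizes depend on the model config and micro-batch.

# Illustrative shapes only: [rows of x], [cols of x], [rows of w1].
tokens, hidden, ffn_out = 4096, 7168, 4096
o1_bytes = tokens * ffn_out * 2              # bf16 o1 kept alive until backward
gemm_flops = 2 * tokens * hidden * ffn_out   # the o1 recompute GEMM that is skipped
print(f"extra activation memory per layer ~ {o1_bytes / 2**20:.0f} MiB")
print(f"FLOPs avoided per layer per backward ~ {gemm_flops / 1e12:.2f} TFLOPs")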
