Commit 79c421f

Update quant cache (#11007)

* fix
* update quant cache

Co-authored-by: zhangbo9674 <[email protected]>

1 parent: 17cc8a4

3 files changed, +61 -23 lines

paddlenlp/trainer/trainer_callback.py (1 addition, 1 deletion)

@@ -645,7 +645,7 @@ def on_step_begin(self, args, state, control, **kwargs):
         global skip_count

         if (not g_shard_bypass_dygraph_optimizer or skip_count == 0) and hasattr(model, "fp8_quant_weight"):
-            model.fp8_quant_weight(True, quant_transpose=False)
+            model.fp8_quant_weight(True, quant_transpose=True)
             optimizer.clear_param_storage("moe_expert")
             optimizer.clear_param_storage("rms_linear")
             optimizer.clear_param_storage("memory_attn")

paddlenlp/transformers/deepseek_v2/modeling.py (29 additions, 15 deletions)

@@ -1073,35 +1073,49 @@ def __init__(self, config: DeepseekV2Config, norm_weight=None, norm_eps=None):
             )
             set_parameter_color([self.shared_experts.w1, self.shared_experts.w2], "shared_expert")

-    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
+    def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
         """Quantize weights in FP8 format.

         Args:
             batch_mode: If True, quantize all weights in batch mode using the first expert's weights.
                         If False, quantize each expert's weights individually.
         """

-        def quantize_weights(weight_list, weight_obj=None, quant_transpose=True):
+        def quantize_weights(weight_list, weight_obj=None, quant_transpose=None):
             """Helper function to quantize a list of weights."""
             if weight_obj is None:
                 weight_obj = weight_list[0]
-            if hasattr(weight_obj, "fp8_weight_stacked"):
+            if hasattr(weight_obj, "fp8_weight_stacked") or hasattr(weight_obj, "fp8_weight_stacked_transpose"):
                 return

-            # Quantize without transpose
-            fp8_weight, fp8_scale = paddle.incubate.nn.functional.fused_stack_transpose_quant(
-                weight_list, transpose=False
-            )
-            setattr(weight_obj, "fp8_weight_stacked", fp8_weight)
-            setattr(weight_obj, "fp8_scale_stacked", fp8_scale)
+            if quant_transpose is None:
+                fp8_weight, fp8_scale = paddle.incubate.nn.functional.fused_stack_transpose_quant(
+                    weight_list, transpose=False
+                )
+                setattr(weight_obj, "fp8_weight_stacked", fp8_weight)
+                setattr(weight_obj, "fp8_scale_stacked", fp8_scale)

-            if quant_transpose:
-                # Quantize with transpose
                 fp8_weight_t, fp8_scale_t = paddle.incubate.nn.functional.fused_stack_transpose_quant(
                     weight_list, transpose=True
                 )
                 setattr(weight_obj, "fp8_weight_stacked_transpose", fp8_weight_t)
                 setattr(weight_obj, "fp8_scale_stacked_transpose", fp8_scale_t)
+            elif quant_transpose is False:
+                # Only quantize without transpose
+                fp8_weight, fp8_scale = paddle.incubate.nn.functional.fused_stack_transpose_quant(
+                    weight_list, transpose=False
+                )
+                setattr(weight_obj, "fp8_weight_stacked", fp8_weight)
+                setattr(weight_obj, "fp8_scale_stacked", fp8_scale)
+            elif quant_transpose is True:
+                # Only quantize with transpose
+                fp8_weight_t, fp8_scale_t = paddle.incubate.nn.functional.fused_stack_transpose_quant(
+                    weight_list, transpose=True
+                )
+                setattr(weight_obj, "fp8_weight_stacked_transpose", fp8_weight_t)
+                setattr(weight_obj, "fp8_scale_stacked_transpose", fp8_scale_t)
+            else:
+                raise ValueError("Invalid value for `quant_transpose`.")

         if batch_mode:
             # Batch mode: process all experts' weights together
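As a reading aid, the following self-contained sketch (not part of the patch) summarizes which cache attributes the reworked `quantize_weights` helper attaches for each `quant_transpose` value; the attribute names come from the diff above, while the helper itself is hypothetical:

    def stacked_cache_attrs(quant_transpose=None):
        # Hypothetical helper mirroring the branches above, for illustration only.
        if quant_transpose is None:   # cache both copies (the pre-commit default behaviour)
            return ("fp8_weight_stacked", "fp8_scale_stacked",
                    "fp8_weight_stacked_transpose", "fp8_scale_stacked_transpose")
        if quant_transpose is False:  # plain copy only
            return ("fp8_weight_stacked", "fp8_scale_stacked")
        if quant_transpose is True:   # transposed copy only
            return ("fp8_weight_stacked_transpose", "fp8_scale_stacked_transpose")
        raise ValueError("Invalid value for `quant_transpose`.")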
@@ -1830,7 +1844,7 @@ def __init__(
         )
         set_parameter_color([self.q_up_weight, self.kv_up_weight], "memory_attn")

-    def fp8_quant_weight(self, quant_transpose=True):
+    def fp8_quant_weight(self, quant_transpose=None):
         cache_fp8_weight(self.q_up_weight, quant_transpose=quant_transpose)
         cache_fp8_weight(self.kv_up_weight, quant_transpose=quant_transpose)

@@ -1959,7 +1973,7 @@ def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
         self.eps = eps
         set_parameter_color([self.q_down_weight], "rms_linear")

-    def fp8_quant_weight(self, quant_transpose=True):
+    def fp8_quant_weight(self, quant_transpose=None):
         cache_fp8_weight(self.q_down_weight, quant_transpose=quant_transpose)

     def forward(self, x):

@@ -2124,7 +2138,7 @@ def linear_dtype_gaurd():

         self.attn_func = scaled_dot_product_attention

-    def fp8_quant_weight(self, quant_transpose=True):
+    def fp8_quant_weight(self, quant_transpose=None):

         if DSV3_USE_ATTEN_RECOMPUTE:
             self.o_proj.fp8_quant_weight(quant_transpose=quant_transpose)

@@ -2356,7 +2370,7 @@ def __init__(
         else:
             self.mlp = DeepseekV2MLPClass(config, recompute_fwd_gate_up=True)

-    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
+    def fp8_quant_weight(self, batch_mode=False, quant_transpose=None):
         """fp8_quant_weight"""
         if isinstance(self.mlp, DeepseekV2MoE):
             # logger.info(f"fp8 quant weight for mlp {type(self.mlp)}")

paddlenlp/transformers/fp8_utils.py (31 additions, 7 deletions)

@@ -92,6 +92,8 @@ def fused_stack_quant(expert_weight_list, transpose=False):
         w, scale = _get_fp8_weight_and_scale(expert_weight_list[0], stacked=True, transpose=True)
     elif transpose is True and hasattr(expert_weight_list[0], "fp8_weight_stacked"):
         w, scale = _get_fp8_weight_and_scale(expert_weight_list[0], stacked=True, transpose=False)
+    elif transpose is False and hasattr(expert_weight_list[0], "fp8_weight_stacked_transpose"):
+        w, scale = _get_fp8_weight_and_scale(expert_weight_list[0], stacked=True, transpose=True)
     else:
         w, scale = paddle.incubate.nn.functional.fused_stack_transpose_quant(expert_weight_list, transpose=transpose)
     return w, scale
@@ -114,6 +116,8 @@ def weight_quant(weight, transpose=False):
     else:
         if hasattr(weight, "fp8_weight"):
            return weight.fp8_weight, weight.fp8_scale
+        elif hasattr(weight, "fp8_weight_transpose"):
+            return weight.fp8_weight_transpose.T.contiguous(), weight.fp8_scale_transpose.T.contiguous()
         else:
             return paddle.incubate.nn.functional.fp8_quant_blockwise(
                 weight,
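The new `elif` branch lets `weight_quant` serve a request for the non-transposed weight from a transpose-only cache. A minimal sketch of why that works, using ordinary float tensors instead of FP8 and made-up shapes:

    import paddle

    w = paddle.randn([256, 128])             # stands in for the original weight layout
    w_t_cached = w.T.contiguous()            # layout of the cached fp8_weight_transpose
    w_recovered = w_t_cached.T.contiguous()  # transposing back recovers the original layout
    assert w_recovered.shape == w.shape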
@@ -596,23 +600,33 @@ def forward(self, x):
         return FP8LinearFunction.apply(x, self, keep_x=False)


-def cache_fp8_weight(weight, quant_transpose=True):
-    if hasattr(weight, "fp8_weight"):
+def cache_fp8_weight(weight, quant_transpose=None):
+    if hasattr(weight, "fp8_weight") or hasattr(weight, "fp8_weight_transpose"):
         return
-
-    if quant_transpose:
+    if quant_transpose is None:
         w_fp8, w_scale, w_t_fp8, w_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
             weight,
             output_scale_transpose=False,
             quant_method="128x128",
             input_transpose=True,
             return_transpose_only=False,
         )
+
         setattr(weight, "fp8_weight_transpose", w_t_fp8)
         setattr(weight, "fp8_scale_transpose", w_t_scale)
         setattr(weight, "fp8_weight", w_fp8)
         setattr(weight, "fp8_scale", w_scale)
-    else:
+    elif quant_transpose is True:
+        w_t_fp8, w_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            weight,
+            output_scale_transpose=False,
+            quant_method="128x128",
+            input_transpose=True,
+            return_transpose_only=True,
+        )
+        setattr(weight, "fp8_weight_transpose", w_t_fp8)
+        setattr(weight, "fp8_scale_transpose", w_t_scale)
+    elif quant_transpose is False:
         w_fp8, w_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
             weight,
             output_scale_transpose=False,

@@ -622,6 +636,8 @@ def cache_fp8_weight(weight, quant_transpose=True):
         )
         setattr(weight, "fp8_weight", w_fp8)
         setattr(weight, "fp8_scale", w_scale)
+    else:
+        raise ValueError("quant_transpose must be either True, False or None.")


 class FP8KeepXLinear(paddle.nn.Layer):
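A hedged usage sketch for the reworked `cache_fp8_weight` (the layer construction and sizes below are made up, and the blockwise FP8 quantization is assumed to run on FP8-capable hardware):

    layer = FP8KeepXLinear(in_features=1024, out_features=1024)

    cache_fp8_weight(layer.weight, quant_transpose=True)   # attaches fp8_weight_transpose / fp8_scale_transpose only
    cache_fp8_weight(layer.weight, quant_transpose=None)   # no-op: the hasattr() guard returns early once either cache exists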
@@ -636,7 +652,7 @@ def __init__(self, in_features: int, out_features: int, bias_attr: bool = False)
         )
         set_parameter_color([self.weight], "attn_out_project")

-    def fp8_quant_weight(self, quant_transpose=True):
+    def fp8_quant_weight(self, quant_transpose=None):
         cache_fp8_weight(self.weight, quant_transpose=quant_transpose)

     def forward(self, x):

@@ -798,7 +814,7 @@ def __init__(
             is_bias=False,
         )

-    def fp8_quant_weight(self, quant_transpose=True):
+    def fp8_quant_weight(self, quant_transpose=None):
         cache_fp8_weight(self.w1, quant_transpose)
         cache_fp8_weight(self.w2, quant_transpose)
@@ -980,6 +996,10 @@ def bwd_dowm_input(self, expert_w2, unzipped_grad, o1, tokens_per_expert, m_indi
         bw_w2_quant = bw_w2_quant.reshape([len(expert_w2), -1, bw_w2_quant.shape[-1]])
         bw_w2_scale = bw_w2_scale.reshape([len(expert_w2), -1, bw_w2_scale.shape[-1]])

+        if hasattr(expert_w2[0], "fp8_weight_stacked_transpose") and not hasattr(expert_w2[0], "fp8_weight_stacked"):
+            bw_w2_quant = bw_w2_quant.contiguous().transpose([0, 2, 1]).contiguous()
+            bw_w2_scale = bw_w2_scale.contiguous().transpose([0, 2, 1]).contiguous()
+
         # compute gemm
         if isinstance(unzipped_grad, tuple):
             (unzipped_grad_fp8, unzipped_grad_scale) = unzipped_grad

@@ -1024,6 +1044,10 @@ def bwd_gate_up_input(self, do1, expert_w1, tokens_per_expert, m_indices=None, d
         bw_w1_quant = bw_w1_quant.reshape([len(expert_w1), -1, bw_w1_quant.shape[-1]])
         bw_w1_scale = bw_w1_scale.reshape([len(expert_w1), -1, bw_w1_scale.shape[-1]])

+        if hasattr(expert_w1[0], "fp8_weight_stacked_transpose") and not hasattr(expert_w1[0], "fp8_weight_stacked"):
+            bw_w1_quant = bw_w1_quant.contiguous().transpose([0, 2, 1]).contiguous()
+            bw_w1_scale = bw_w1_scale.contiguous().transpose([0, 2, 1]).contiguous()
+
         # quant do1
         do1_fp8, do1_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
             do1, output_scale_transpose=True, quant_method="1x128", input_transpose=False
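The two guarded blocks above compensate for a transpose-only cache in the backward pass: when only `fp8_weight_stacked_transpose` exists, the stacked weights come back with their last two axes swapped, so they are permuted back before the grouped GEMM. An illustrative sketch with assumed shapes:

    import paddle

    num_experts, k, n = 8, 1024, 4096
    bw_w_quant = paddle.randn([num_experts, n, k])                           # layout produced from the transposed cache
    bw_w_quant = bw_w_quant.contiguous().transpose([0, 2, 1]).contiguous()   # swap the last two axes back
    assert bw_w_quant.shape == [num_experts, k, n]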
