@@ -215,31 +215,42 @@ def assign_kv_heads(num_kv_heads: int, num_gpus: int):
class LMHeadFunction(paddle.autograd.PyLayer):
    @staticmethod
    def forward(ctx, x, weight, transpose_y):
-        out = paddle.matmul(x, weight, transpose_y=transpose_y)
+        out = paddle.matmul(x, weight, transpose_y=transpose_y)

-        ctx.save_for_backward(x, weight, transpose_y)
+        ctx.save_for_backward(x, weight, transpose_y)
        return out

    @staticmethod
    def backward(ctx, dout):
        if dout.dtype == paddle.float32:
-            dout = dout.cast( paddle.bfloat16)
+            dout = dout.cast(paddle.bfloat16)

        x, weight, transpose_y = ctx.saved_tensor()

-        dx = paddle.matmul( dout, weight, transpose_y=not transpose_y)
+        dx = paddle.matmul(dout, weight, transpose_y=not transpose_y)
        if transpose_y:
            with paddle.amp.auto_cast(False):
                paddle._C_ops.fused_linear_param_grad_add(
-                    dout.reshape( [-1, dout.shape[-1]]), x.reshape( [-1, x.shape[-1]]), weight.main_grad, None, True, False
-                )
+                    dout.reshape([-1, dout.shape[-1]]),
+                    x.reshape([-1, x.shape[-1]]),
+                    weight.main_grad,
+                    None,
+                    True,
+                    False,
+                )
        else:
            with paddle.amp.auto_cast(False):
                paddle._C_ops.fused_linear_param_grad_add(
-                    x.reshape([-1, x.shape[-1]]), dout.reshape([-1, dout.shape[-1]]), weight.main_grad, None, True, False
-                )
+                    x.reshape([-1, x.shape[-1]]),
+                    dout.reshape([-1, dout.shape[-1]]),
+                    weight.main_grad,
+                    None,
+                    True,
+                    False,
+                )
        return dx, None

+
def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_parallel_output=True):
    is_fleet_init = True
    tensor_parallel_degree = 1
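Note on the hunk above: the LM-head GEMM is wrapped in a custom paddle.autograd.PyLayer whose forward saves the activations and weight, and whose backward recomputes dx with the transposed weight while accumulating the weight gradient directly into weight.main_grad through fused_linear_param_grad_add (hence the returned None for the weight). A minimal standalone sketch of that PyLayer pattern, not the PR's code, which returns the weight gradient explicitly instead of using the fused accumulation op:

import paddle


class MatmulSketch(paddle.autograd.PyLayer):
    @staticmethod
    def forward(ctx, x, weight):
        # Save inputs for the backward pass, return the plain GEMM result.
        ctx.save_for_backward(x, weight)
        return paddle.matmul(x, weight)

    @staticmethod
    def backward(ctx, dout):
        x, weight = ctx.saved_tensor()
        dx = paddle.matmul(dout, weight, transpose_y=True)  # dL/dx = dout @ W^T
        dweight = paddle.matmul(x, dout, transpose_x=True)  # dL/dW = x^T @ dout
        return dx, dweight


x = paddle.randn([4, 8])
w = paddle.randn([8, 16])
x.stop_gradient = False
w.stop_gradient = False
MatmulSketch.apply(x, w).sum().backward()  # populates x.grad and w.grad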
@@ -269,6 +280,7 @@ def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_parallel_out
        logits = LMHeadFunction.apply(x, y, transpose_y=transpose_y)
        return logits

+
def scaled_dot_product_attention(
    query_states,
    config,
@@ -633,7 +645,9 @@ def _set_cos_sin_cache(self, seq_len):
        dim = self.dim

        freq_extra = 1.0 / (self.base ** (paddle.arange(0, dim, 2, dtype=paddle.float32) / dim))
-        freq_inter = 1.0 / (self.scaling_factor * self.base ** (paddle.arange(0, dim, 2, dtype=paddle.float32) / dim))
+        freq_inter = 1.0 / (
+            self.scaling_factor * self.base ** (paddle.arange(0, dim, 2, dtype=paddle.float32) / dim)
+        )

        low, high = yarn_find_correction_range(
            self.beta_fast,
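The change above only splits the expression across lines; numerically, freq_inter is still freq_extra / scaling_factor. A quick standalone check (the base, dim, and scaling_factor values here are made up):

import paddle

base, dim, scaling_factor = 10000.0, 64, 4.0
exponent = paddle.arange(0, dim, 2, dtype=paddle.float32) / dim
freq_extra = 1.0 / (base**exponent)
freq_inter = 1.0 / (scaling_factor * base**exponent)
assert paddle.allclose(freq_inter, freq_extra / scaling_factor)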
@@ -1059,15 +1073,15 @@ def __init__(self, config: DeepseekV2Config, norm_weight=None, norm_eps=None):
        )
        set_parameter_color([self.shared_experts.w1, self.shared_experts.w2], "shared_expert")

-    def fp8_quant_weight(self, batch_mode=False):
+    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
        """Quantize weights in FP8 format.

        Args:
            batch_mode: If True, quantize all weights in batch mode using the first expert's weights.
                If False, quantize each expert's weights individually.
        """

-        def quantize_weights(weight_list, weight_obj=None):
+        def quantize_weights(weight_list, weight_obj=None, quant_transpose=True):
            """Helper function to quantize a list of weights."""
            if weight_obj is None:
                weight_obj = weight_list[0]
@@ -1081,31 +1095,32 @@ def quantize_weights(weight_list, weight_obj=None):
            setattr(weight_obj, "fp8_weight_stacked", fp8_weight)
            setattr(weight_obj, "fp8_scale_stacked", fp8_scale)

-            # Quantize with transpose
-            fp8_weight_t, fp8_scale_t = paddle.incubate.nn.functional.fused_stack_transpose_quant(
-                weight_list, transpose=True
-            )
-            setattr(weight_obj, "fp8_weight_stacked_transpose", fp8_weight_t)
-            setattr(weight_obj, "fp8_scale_stacked_transpose", fp8_scale_t)
+            if quant_transpose:
+                # Quantize with transpose
+                fp8_weight_t, fp8_scale_t = paddle.incubate.nn.functional.fused_stack_transpose_quant(
+                    weight_list, transpose=True
+                )
+                setattr(weight_obj, "fp8_weight_stacked_transpose", fp8_weight_t)
+                setattr(weight_obj, "fp8_scale_stacked_transpose", fp8_scale_t)

        if batch_mode:
            # Batch mode: process all experts' weights together
            expert_w1_list = [expert.w1 for expert in self.experts if expert is not None]
            expert_w2_list = [expert.w2 for expert in self.experts if expert is not None]

            if expert_w1_list:
-                quantize_weights(expert_w1_list, expert_w1_list[0])
+                quantize_weights(expert_w1_list, expert_w1_list[0], quant_transpose)
            if expert_w2_list:
-                quantize_weights(expert_w2_list, expert_w2_list[0])
+                quantize_weights(expert_w2_list, expert_w2_list[0], quant_transpose)
        else:
            # Individual mode: process each expert's weights separately
            for expert in self.experts:
                if expert is not None:
-                    quantize_weights([expert.w1])
-                    quantize_weights([expert.w1])
+                    quantize_weights([expert.w1], quant_transpose=quant_transpose)
+                    quantize_weights([expert.w2], quant_transpose=quant_transpose)

        if self.config.n_shared_experts is not None:
-            self.shared_experts.fp8_quant_weight()
+            self.shared_experts.fp8_quant_weight(quant_transpose)

    def forward(self, hidden_states):
        if self.using_post_norm_recompute:
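The new quant_transpose flag lets callers skip the transposed FP8 copy (the fused_stack_transpose_quant call and the fp8_weight_stacked_transpose / fp8_scale_stacked_transpose attributes) when it is not needed. An illustrative sketch of how the flag threads from a decoder layer down to each expert, using stand-in classes rather than the library's:

class ExpertSketch:
    def fp8_quant_weight(self, quant_transpose=True):
        # Stand-in for caching the FP8 copies; the real code calls the fused quant ops.
        self.cached_transpose_copy = quant_transpose


class MoESketch:
    def __init__(self):
        self.experts = [ExpertSketch(), ExpertSketch()]

    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
        # Forward the flag to every expert, mirroring the diff's call pattern.
        for expert in self.experts:
            expert.fp8_quant_weight(quant_transpose=quant_transpose)


class LayerSketch:
    def __init__(self):
        self.mlp = MoESketch()

    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
        self.mlp.fp8_quant_weight(batch_mode, quant_transpose=quant_transpose)


# Skipping the transposed copy everywhere with a single argument at the top:
LayerSketch().fp8_quant_weight(batch_mode=True, quant_transpose=False)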
@@ -1762,9 +1777,9 @@ def __init__(
        )
        set_parameter_color([self.q_up_weight, self.kv_up_weight], "memory_attn")

-    def fp8_quant_weight(self):
-        cache_fp8_weight(self.q_up_weight)
-        cache_fp8_weight(self.kv_up_weight)
+    def fp8_quant_weight(self, quant_transpose=True):
+        cache_fp8_weight(self.q_up_weight, quant_transpose=quant_transpose)
+        cache_fp8_weight(self.kv_up_weight, quant_transpose=quant_transpose)

    def forward(self, q_init, kv_init, position_ids):
@@ -1890,8 +1905,8 @@ def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
        self.eps = eps
        set_parameter_color([self.q_down_weight], "rms_linear")

-    def fp8_quant_weight(self):
-        cache_fp8_weight(self.q_down_weight)
+    def fp8_quant_weight(self, quant_transpose=True):
+        cache_fp8_weight(self.q_down_weight, quant_transpose=quant_transpose)

    def forward(self, x):
@@ -2053,12 +2068,12 @@ def linear_dtype_gaurd():

        self.attn_func = scaled_dot_product_attention

-    def fp8_quant_weight(self):
+    def fp8_quant_weight(self, quant_transpose=True):

        if DSV3_USE_ATTEN_RECOMPUTE:
-            self.o_proj.fp8_quant_weight()
-            self.memory_recompute_att.fp8_quant_weight()
-            self.fused_rms_norm_linear.fp8_quant_weight()
+            self.o_proj.fp8_quant_weight(quant_transpose=quant_transpose)
+            self.memory_recompute_att.fp8_quant_weight(quant_transpose=quant_transpose)
+            self.fused_rms_norm_linear.fp8_quant_weight(quant_transpose=quant_transpose)

    def _init_rope(self):
        if self.config.rope_scaling is None:
@@ -2279,16 +2294,16 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute
                else DeepseekV2MoE(config)
            )
        else:
-            self.mlp = DeepseekV2MLPClass(config)
+            self.mlp = DeepseekV2MLPClass(config, recompute_fwd_gate_up=True)

-    def fp8_quant_weight(self, batch_mode=False):
+    def fp8_quant_weight(self, batch_mode=False, quant_transpose=True):
        """fp8_quant_weight"""
        if isinstance(self.mlp, DeepseekV2MoE):
            # logger.info(f"fp8 quant weight for mlp {type(self.mlp)}")
-            self.mlp.fp8_quant_weight(batch_mode)
-            self.self_attn.fp8_quant_weight()
+            self.mlp.fp8_quant_weight(batch_mode, quant_transpose=quant_transpose)
+            self.self_attn.fp8_quant_weight(quant_transpose=quant_transpose)
        elif isinstance(self.mlp, FP8Mlp):
-            self.self_attn.fp8_quant_weight()
+            self.self_attn.fp8_quant_weight(quant_transpose=quant_transpose)

    def forward(
        self,
@@ -2496,9 +2511,9 @@ def forward(
    ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
        hidden_states = self.hnorm(hidden_states)
        nextn_hidden_state = self.enorm(nextn_hidden_state)
-
+
        concat_h = paddle.concat([hidden_states, nextn_hidden_state], axis=-1)
-        hidden_states = LMHeadFunction.apply( concat_h, self.eh_proj.weight, False)
+        hidden_states = LMHeadFunction.apply(concat_h, self.eh_proj.weight, False)

        layer_outputs = super(DeepseekV2MTPLayer, self).forward(
            hidden_states,
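A shape sketch for the MTP projection above, with illustrative sizes; the [2 * hidden, hidden] layout assumed for eh_proj.weight matches the transpose_y=False call but is an assumption. Concatenating the two normalized streams doubles the last dimension, and the projection maps it back to the hidden size:

import paddle

hidden_states = paddle.randn([2, 16, 1024])
nextn_hidden_state = paddle.randn([2, 16, 1024])
concat_h = paddle.concat([hidden_states, nextn_hidden_state], axis=-1)  # [2, 16, 2048]
eh_proj_weight = paddle.randn([2048, 1024])  # assumed [2 * hidden, hidden] layout
out = paddle.matmul(concat_h, eh_proj_weight)  # back to [2, 16, 1024]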
0 commit comments