
Commit 542c1b5

Add config control for quant cache (#10926)
* add config
* refine
1 parent b93fda9 commit 542c1b5

3 files changed: +121 -80 lines changed

paddlenlp/transformers/deepseek_v2/configuration.py

Lines changed: 4 additions & 0 deletions
@@ -186,6 +186,8 @@ def __init__(
         is_split_group_gemm=False,
         fakse_gate_restrict_balance=False,
         adaptive_remained_O1_recompute_ratio=0,
+        offline_quant_expert_weight=True,
+        clear_origin_weight_when_offline_quant=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -241,6 +243,8 @@ def __init__(
         self.is_split_group_gemm = is_split_group_gemm
         self.fakse_gate_restrict_balance = fakse_gate_restrict_balance
         self.adaptive_remained_O1_recompute_ratio = adaptive_remained_O1_recompute_ratio
+        self.offline_quant_expert_weight = offline_quant_expert_weight
+        self.clear_origin_weight_when_offline_quant = clear_origin_weight_when_offline_quant

         super().__init__(
             pad_token_id=pad_token_id,
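A minimal usage sketch of the two new switches (both default to `True`, as shown above; the import path follows the file touched in this commit, other fields are left at their defaults):

```python
from paddlenlp.transformers.deepseek_v2.configuration import DeepseekV2Config

# Assumed intent, inferred from the flag names and the modeling.py change below:
#   offline_quant_expert_weight            -> cache FP8 copies of expert weights ahead of time
#   clear_origin_weight_when_offline_quant -> allow the original copies to be reclaimed once cached
config = DeepseekV2Config(
    offline_quant_expert_weight=True,
    clear_origin_weight_when_offline_quant=True,
)
```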

paddlenlp/transformers/deepseek_v2/modeling.py

Lines changed: 116 additions & 80 deletions
@@ -82,7 +82,13 @@
 
 from paddle.distributed.fleet.meta_parallel.zero_bubble_utils import WeightGradStore
 
-from ..fp8_utils import FP8KeepXLinear, FP8Linear, FP8Mlp, FP8LinearFunctionBase, cache_fp8_weight
+from ..fp8_utils import (
+    FP8KeepXLinear,
+    FP8Linear,
+    FP8LinearFunctionBase,
+    FP8Mlp,
+    cache_fp8_weight,
+)
 from .fp8_linear import Linear
 
 DSV3_USE_FP8_GEMM = os.getenv("DSV3_USE_FP8_GEMM", "False").lower() == "true"
@@ -961,9 +967,10 @@ def __init__(self, config: DeepseekV2Config, norm_weight=None, norm_eps=None):
             using_post_norm_recompute=self.using_post_norm_recompute,
         )
 
-        # moe_grad_group = fleet.get_hybrid_communicate_group().expert_grad_comm_group
-        # for p in self.experts.parameters():
-        #     setattr(p, "color", {"color": "moe_expert", "group": moe_grad_group})
+        if config.offline_quant_expert_weight and config.clear_origin_weight_when_offline_quant:
+            moe_grad_group = fleet.get_hybrid_communicate_group().expert_grad_comm_group
+            for p in self.experts.parameters():
+                setattr(p, "color", {"color": "moe_expert", "group": moe_grad_group})
 
         self.alpha = config.aux_loss_alpha
         if config.n_shared_experts is not None:
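Tagging each expert parameter with a `color` dict ties it to the expert gradient communication group. A rough sketch of what a consumer of this tag could look like (the `sync_moe_expert_grads` helper below is hypothetical, not part of this PR):

```python
import paddle.distributed as dist


def sync_moe_expert_grads(model):
    # Hypothetical consumer: all-reduce gradients of parameters tagged as "moe_expert"
    # within the communication group attached by the constructor above.
    for p in model.parameters():
        tag = getattr(p, "color", None)
        if tag is not None and tag.get("color") == "moe_expert" and p.grad is not None:
            dist.all_reduce(p.grad, group=tag["group"])
```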
@@ -995,7 +1002,7 @@ def quantize_weights(weight_list, weight_obj=None):
             """Helper function to quantize a list of weights."""
             if weight_obj is None:
                 weight_obj = weight_list[0]
-            if hasattr( weight_obj, "fp8_weight_stacked"):
+            if hasattr(weight_obj, "fp8_weight_stacked"):
                 return
 
             # Quantize without transpose
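The `hasattr` guard above makes weight quantization idempotent: once an FP8 copy is attached to a parameter, later calls return early, so the cache is built once and reused. A simplified sketch of the idea (helper and attribute names other than `fp8_weight_stacked` are illustrative):

```python
def cache_quantized_copy(weight, quantize_fn):
    """Attach a quantized copy to `weight` once; later calls are no-ops."""
    if hasattr(weight, "fp8_weight_stacked"):  # mirrors the guard in the diff
        return
    fp8_tensor, scale = quantize_fn(weight)  # quantize_fn is a placeholder
    weight.fp8_weight_stacked = fp8_tensor
    weight.fp8_weight_scale = scale  # illustrative attribute name
```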
@@ -1027,7 +1034,7 @@ def quantize_weights(weight_list, weight_obj=None):
             if expert is not None:
                 quantize_weights([expert.w1])
                 quantize_weights([expert.w1])
-
+
         if self.config.n_shared_experts is not None:
             self.shared_experts.fp8_quant_weight()
 
@@ -1194,18 +1201,22 @@ def forward(
 
         bsz = q_init.shape[0]
         q_ln_t, q_ln_invar = fused_ln.fused_rms_norm(q_init, q_ln_weight, eps)
-        #q = paddle.matmul(q_ln_t, q_up_weight)
+        # q = paddle.matmul(q_ln_t, q_up_weight)
         q_orig_shape = q_ln_t.shape
-        q = FP8LinearFunctionBase.compute_fp8_linear(q_ln_t.reshape([-1, q_orig_shape[-1]]), q_up_weight, weight_transpose=True, return_transpose_only=True)
-        q = q.reshape( q_orig_shape[:-1] + [q_up_weight.shape[-1]])
+        q = FP8LinearFunctionBase.compute_fp8_linear(
+            q_ln_t.reshape([-1, q_orig_shape[-1]]), q_up_weight, weight_transpose=True, return_transpose_only=True
+        )
+        q = q.reshape(q_orig_shape[:-1] + [q_up_weight.shape[-1]])
 
         compressed_kv, k_pe = paddle.split(kv_init, [kv_lora_rank, qk_rope_head_dim], axis=-1)
 
         kv_ln_t, kv_ln_invar = fused_ln.fused_rms_norm(compressed_kv, kv_ln_weight, eps)
-        #kv = paddle.matmul(kv_ln_t, kv_up_weight)
+        # kv = paddle.matmul(kv_ln_t, kv_up_weight)
         kv_orig_shape = kv_ln_t.shape
-        kv = FP8LinearFunctionBase.compute_fp8_linear(kv_ln_t.reshape([-1, kv_orig_shape[-1]]), kv_up_weight, weight_transpose=True, return_transpose_only=True)
-        kv = kv.reshape( kv_orig_shape[:-1] + [kv_up_weight.shape[-1]])
+        kv = FP8LinearFunctionBase.compute_fp8_linear(
+            kv_ln_t.reshape([-1, kv_orig_shape[-1]]), kv_up_weight, weight_transpose=True, return_transpose_only=True
+        )
+        kv = kv.reshape(kv_orig_shape[:-1] + [kv_up_weight.shape[-1]])
 
         query_states, key_states, value_states = qkv_pre_process(
             q,
@@ -1366,25 +1377,34 @@ def backward(ctx, dout):
 
         q_ln_t, q_ln_invar = fused_ln.fused_rms_norm(q_init, q_ln_weight, eps)
 
-
         q_ln_fp8, q_ln_scale, q_ln_trans_fp8, q_ln_trans_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            q_ln_t.reshape([-1, q_ln_t.shape[-1]]), output_scale_transpose=True,
-            quant_method="1x128", input_transpose=True )
-
+            q_ln_t.reshape([-1, q_ln_t.shape[-1]]),
+            output_scale_transpose=True,
+            quant_method="1x128",
+            input_transpose=True,
+        )
+
         q_orig_shape = q_ln_t.shape
-        q = FP8LinearFunctionBase.compute_fp8_linear((q_ln_fp8, q_ln_scale), q_up_weight, weight_transpose=True, return_transpose_only=True)
-        q = q.reshape( q_orig_shape[:-1] + [q_up_weight.shape[-1]])
+        q = FP8LinearFunctionBase.compute_fp8_linear(
+            (q_ln_fp8, q_ln_scale), q_up_weight, weight_transpose=True, return_transpose_only=True
+        )
+        q = q.reshape(q_orig_shape[:-1] + [q_up_weight.shape[-1]])
 
         compressed_kv, k_pe = paddle.split(kv_init, [kv_lora_rank, qk_rope_head_dim], axis=-1)
 
         kv_ln_t, kv_ln_invar = fused_ln.fused_rms_norm(compressed_kv, kv_ln_weight, eps)
-
+
         kv_ln_fp8, kv_ln_scale, kv_ln_trans_fp8, kv_ln_trans_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            kv_ln_t.reshape([-1, kv_ln_t.shape[-1]]), output_scale_transpose=True,
-            quant_method="1x128", input_transpose=True )
+            kv_ln_t.reshape([-1, kv_ln_t.shape[-1]]),
+            output_scale_transpose=True,
+            quant_method="1x128",
+            input_transpose=True,
+        )
         kv_orig_shape = kv_ln_t.shape
-        kv = FP8LinearFunctionBase.compute_fp8_linear((kv_ln_fp8, kv_ln_scale), kv_up_weight, weight_transpose=True, return_transpose_only=True)
-        kv = kv.reshape( kv_orig_shape[:-1] + [kv_up_weight.shape[-1]])
+        kv = FP8LinearFunctionBase.compute_fp8_linear(
+            (kv_ln_fp8, kv_ln_scale), kv_up_weight, weight_transpose=True, return_transpose_only=True
+        )
+        kv = kv.reshape(kv_orig_shape[:-1] + [kv_up_weight.shape[-1]])
 
         paddle.base.core._set_has_grad(True)
         q.stop_gradient = False
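Both the forward and the recompute path above replace a plain `paddle.matmul` (kept as comments in the forward hunk) with flatten, block-wise FP8 quantization, an FP8 GEMM, and a reshape back. Shape-wise the computation is equivalent to this bf16 reference (a sketch of the tensor bookkeeping only, not of the FP8 kernels):

```python
import paddle


def up_project_reference(x, up_weight):
    """bf16 shape-equivalent of the FP8 up-projections above."""
    orig_shape = x.shape
    x2d = x.reshape([-1, orig_shape[-1]])  # flatten [bsz, seq, hidden] -> [bsz * seq, hidden]
    out = paddle.matmul(x2d, up_weight)    # the FP8 path uses compute_fp8_linear here
    return out.reshape(orig_shape[:-1] + [up_weight.shape[-1]])
```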
@@ -1465,11 +1485,16 @@ def backward(ctx, dout):
         # call up proj
         if hasattr(kv_up_weight, "main_grad"):
             d_kv_fp8, d_kv_scale, d_kv_t_fp8, d_kv_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-                d_kv.reshape([-1, d_kv.shape[-1]]), output_scale_transpose=True,
-                quant_method="1x128", input_transpose=True )
+                d_kv.reshape([-1, d_kv.shape[-1]]),
+                output_scale_transpose=True,
+                quant_method="1x128",
+                input_transpose=True,
+            )
 
-            d_kv_ln_t = FP8LinearFunctionBase.compute_fp8_linear((d_kv_fp8, d_kv_scale), kv_up_weight, weight_transpose=False)
-            d_kv_ln_t = d_kv_ln_t.reshape( d_kv.shape[:-1] + [kv_up_weight.shape[0]])
+            d_kv_ln_t = FP8LinearFunctionBase.compute_fp8_linear(
+                (d_kv_fp8, d_kv_scale), kv_up_weight, weight_transpose=False
+            )
+            d_kv_ln_t = d_kv_ln_t.reshape(d_kv.shape[:-1] + [kv_up_weight.shape[0]])
 
             def kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_scale, kv_up_weight):
                 FP8LinearFunctionBase.kitchen_gemm(
@@ -1480,11 +1505,16 @@ def kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_sca
                     True,
                     True,
                     kv_up_weight.main_grad,
-                    paddle.float32 )
-
+                    paddle.float32,
+                )
+
             if WeightGradStore.enabled:
-
-                WeightGradStore.put(partial(kv_up_weight_grad, kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_scale, kv_up_weight))
+
+                WeightGradStore.put(
+                    partial(
+                        kv_up_weight_grad, kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_scale, kv_up_weight
+                    )
+                )
             else:
                 kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_scale, kv_up_weight)
 
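When `WeightGradStore.enabled` is set (zero-bubble pipeline scheduling), the weight-gradient GEMM is not run inline; a `functools.partial` closure that accumulates into `main_grad` is queued and flushed later. The deferral pattern reduced to its core (the `GradStore` class below is a stand-in for illustration, not Paddle's `WeightGradStore`):

```python
from functools import partial


class GradStore:
    """Stand-in: queue weight-gradient work now, run it at a later pipeline stage."""

    enabled = True
    _queue = []

    @classmethod
    def put(cls, fn):
        cls._queue.append(fn)

    @classmethod
    def flush(cls):
        while cls._queue:
            cls._queue.pop(0)()  # run the deferred weight-grad updates


def weight_grad(x_t, d_out_t, weight):
    # Accumulate into main_grad, as kitchen_gemm does in the hunk above.
    weight.main_grad += x_t @ d_out_t


def backward_step(x_t, d_out_t, weight):
    if GradStore.enabled:
        GradStore.put(partial(weight_grad, x_t, d_out_t, weight))  # defer
    else:
        weight_grad(x_t, d_out_t, weight)  # run inline
```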
@@ -1493,7 +1523,6 @@ def kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_sca
         else:
             d_kv_ln_t, d_kv_up_weight = _C_ops.matmul_grad(kv_ln_t, kv_up_weight, d_kv, False, False)
 
-
         d_compressed_kv, d_kv_ln_weight = fused_ln.fused_rms_norm_grad_func(
             compressed_kv, kv_ln_weight, kv_ln_invar, d_kv_ln_t, eps
         )
@@ -1503,15 +1532,19 @@ def kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_sca
         if hasattr(q_up_weight, "main_grad"):
 
             d_q_fp8, d_q_scale, d_q_t_fp8, d_q_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-                d_q.reshape([-1, d_q.shape[-1]]), output_scale_transpose=True,
-                quant_method="1x128", input_transpose=True )
-            #d_q_ln_t = paddle.matmul(d_q, q_up_weight, transpose_y=True)
+                d_q.reshape([-1, d_q.shape[-1]]),
+                output_scale_transpose=True,
+                quant_method="1x128",
+                input_transpose=True,
+            )
+            # d_q_ln_t = paddle.matmul(d_q, q_up_weight, transpose_y=True)
 
-            d_q_ln_t = FP8LinearFunctionBase.compute_fp8_linear((d_q_fp8, d_q_scale), q_up_weight, weight_transpose=False)
-            d_q_ln_t = d_q_ln_t.reshape( d_q.shape[:-1] + [q_up_weight.shape[0]])
+            d_q_ln_t = FP8LinearFunctionBase.compute_fp8_linear(
+                (d_q_fp8, d_q_scale), q_up_weight, weight_transpose=False
+            )
+            d_q_ln_t = d_q_ln_t.reshape(d_q.shape[:-1] + [q_up_weight.shape[0]])
 
-
-            def q_up_weight_grad(q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q_up_weight):
+            def q_up_weight_grad(q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q_up_weight):
                 FP8LinearFunctionBase.kitchen_gemm(
                     q_ln_trans_fp8,
                     q_ln_trans_scale,
@@ -1520,11 +1553,13 @@ def q_up_weight_grad(q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q
                     True,
                     True,
                     q_up_weight.main_grad,
-                    paddle.float32 )
-
+                    paddle.float32,
+                )
 
-            if WeightGradStore.enabled:
-                WeightGradStore.put(partial(q_up_weight_grad, q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q_up_weight))
+            if WeightGradStore.enabled:
+                WeightGradStore.put(
+                    partial(q_up_weight_grad, q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q_up_weight)
+                )
             else:
                 q_up_weight_grad(q_ln_trans_fp8, q_ln_trans_scale, d_q_t_fp8, d_q_t_scale, q_up_weight)
 
@@ -1605,17 +1640,16 @@ def __init__(
         )
 
     def fp8_quant_weight(self):
-        cache_fp8_weight( self.q_up_weight)
-        cache_fp8_weight( self.kv_up_weight)
+        cache_fp8_weight(self.q_up_weight)
+        cache_fp8_weight(self.kv_up_weight)
 
     def forward(self, q_init, kv_init, position_ids):
-
+
         seq_len = q_init.shape[1]
 
         if self.rotary_emb.max_seq_len_cached is None or seq_len > self.rotary_emb.max_seq_len_cached:
             self.rotary_emb._set_cos_sin_cache(seq_len)
 
-
         return MemroyRecomputeAttnFunc.apply(
             q_init,
             kv_init,
@@ -1641,18 +1675,19 @@ class FusedRMSLinearFunc(paddle.autograd.PyLayer):
     def forward(ctx, x, rms_norm_weight, q_down_weight, kv_down_weight, eps):
 
         hidden_states, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
-
+
         h_fp8, h_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            hidden_states.reshape([-1, hidden_states.shape[-1]]), output_scale_transpose=True,
-            quant_method="1x128" )
+            hidden_states.reshape([-1, hidden_states.shape[-1]]), output_scale_transpose=True, quant_method="1x128"
+        )
 
         h_orig_shape = hidden_states.shape
-        q = FP8LinearFunctionBase.compute_fp8_linear((h_fp8, h_scale), q_down_weight, weight_transpose=True, return_transpose_only=True)
-        q = q.reshape( h_orig_shape[:-1] + [q_down_weight.shape[-1]])
-
+        q = FP8LinearFunctionBase.compute_fp8_linear(
+            (h_fp8, h_scale), q_down_weight, weight_transpose=True, return_transpose_only=True
+        )
+        q = q.reshape(h_orig_shape[:-1] + [q_down_weight.shape[-1]])
 
         kv = paddle.matmul(hidden_states, kv_down_weight)
-
+
         ctx.save_for_backward(x, rms_norm_weight, q_down_weight, kv_down_weight)
         ctx.eps = eps
         return q, kv
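`FusedRMSLinearFunc.forward` fuses an RMSNorm with the q/kv down-projections: the normalized hidden state is quantized once and fed to an FP8 GEMM for `q`, while `kv` stays as a bf16 matmul, and only `x` plus the weights are saved so the norm is recomputed in backward. A bf16 reference of what the forward computes (a sketch assuming `fused_rms_norm` is standard RMSNorm; the real path uses the fused and FP8 kernels shown above):

```python
import paddle


def fused_rms_linear_reference(x, rms_norm_weight, q_down_weight, kv_down_weight, eps=1e-6):
    # RMSNorm in float32 for stability, then cast back to the input dtype.
    variance = x.astype("float32").pow(2).mean(axis=-1, keepdim=True)
    hidden = (x.astype("float32") * paddle.rsqrt(variance + eps)).astype(x.dtype) * rms_norm_weight
    q = paddle.matmul(hidden, q_down_weight)    # FP8 compute_fp8_linear in the actual code
    kv = paddle.matmul(hidden, kv_down_weight)  # kept as a bf16 matmul in the actual code
    return q, kv
```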
@@ -1662,35 +1697,39 @@ def backward(ctx, d_q, d_kv):
         x, rms_norm_weight, q_down_weight, kv_down_weight = ctx.saved_tensor()
         eps = ctx.eps
         hidden_states, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
-
+
         h_t_fp8, h_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-            hidden_states.reshape([-1, hidden_states.shape[-1]]), output_scale_transpose=True,
-            quant_method="1x128", input_transpose=True, return_transpose_only=True )
+            hidden_states.reshape([-1, hidden_states.shape[-1]]),
+            output_scale_transpose=True,
+            quant_method="1x128",
+            input_transpose=True,
+            return_transpose_only=True,
+        )
 
         h_grad, d_kv_down_weight = _C_ops.matmul_grad(hidden_states, kv_down_weight, d_kv, False, False)
 
         if hasattr(q_down_weight, "main_grad"):
             d_q_fp8, d_q_scale, d_q_t_fp8, d_q_t_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
-                d_q.reshape([-1, d_q.shape[-1]]), output_scale_transpose=True,
-                quant_method="1x128", input_transpose=True )
-            FP8LinearFunctionBase.compute_fp8_linear((d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False, out=h_grad.view( [-1, h_grad.shape[-1]]))
-
+                d_q.reshape([-1, d_q.shape[-1]]),
+                output_scale_transpose=True,
+                quant_method="1x128",
+                input_transpose=True,
+            )
+            FP8LinearFunctionBase.compute_fp8_linear(
+                (d_q_fp8, d_q_scale), q_down_weight, weight_transpose=False, out=h_grad.view([-1, h_grad.shape[-1]])
+            )
 
-            def q_down_weight_grad(h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight):
+            def q_down_weight_grad(h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight):
                 FP8LinearFunctionBase.kitchen_gemm(
-                    h_t_fp8,
-                    h_t_scale,
-                    d_q_t_fp8,
-                    d_q_t_scale,
-                    True,
-                    True,
-                    q_down_weight.main_grad,
-                    paddle.float32 )
-
-            if WeightGradStore.enabled:
-                WeightGradStore.put(partial(q_down_weight_grad, h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight))
+                    h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, True, True, q_down_weight.main_grad, paddle.float32
+                )
+
+            if WeightGradStore.enabled:
+                WeightGradStore.put(
+                    partial(q_down_weight_grad, h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight)
+                )
             else:
-                q_down_weight_grad( h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight)
+                q_down_weight_grad(h_t_fp8, h_t_scale, d_q_t_fp8, d_q_t_scale, q_down_weight)
 
         d_q_down_weight = None
 
@@ -1726,10 +1765,9 @@ def __init__(self, hidden_size, q_out_dim, kv_outdim, eps=1e-6) -> None:
             is_bias=False,
         )
         self.eps = eps
-
-    def fp8_quant_weight(self):
-        cache_fp8_weight( self.q_down_weight)
-
+
+    def fp8_quant_weight(self):
+        cache_fp8_weight(self.q_down_weight)
 
     def forward(self, x):
 
@@ -1898,8 +1936,6 @@ def fp8_quant_weight(self):
         self.memory_recompute_att.fp8_quant_weight()
         self.fused_rms_norm_linear.fp8_quant_weight()
 
-
-
     def _init_rope(self):
         if self.config.rope_scaling is None:
             self.rotary_emb = DeepseekV2RotaryEmbedding(

paddlenlp/transformers/moe_layer.py

Lines changed: 1 addition & 0 deletions
@@ -633,6 +633,7 @@ def forward(self, hidden_states_out, previous_event=None, async_finish=False, al
                 allocate_on_comm_stream=allocate_on_comm_stream,
             )
             output_combine.stop_gradient = False
+            self.token_dispatcher._comm_manager.handle = None
             return output_combine
 
     @paddle.no_grad()
