
Commit fba788d

update fp8 quant

1 parent 7f92a33 commit fba788d

8 files changed: +68 -85 lines changed

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 5 additions & 5 deletions

@@ -4,7 +4,7 @@
 from typing import Optional, Tuple, List, Dict, Any
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id
 from .base_weight import BaseWeight
-from lightllm.common.fused_moe.grouped_fused_moe_ep import fused_experts_impl, masked_group_gemm, tma_aligned_quantize
+from lightllm.common.fused_moe.grouped_fused_moe_ep import fused_experts_impl, masked_group_gemm
 from lightllm.common.fused_moe.moe_silu_and_mul import silu_and_mul_fwd
 from lightllm.distributed import dist_group_manager
 from lightllm.common.fused_moe.topk_select import select_experts

@@ -228,9 +228,7 @@ def select_experts_and_quant_input(
         if w1.ndim == 3:
             block_size_k = w1.shape[2] // w1_scale.shape[2]
             assert block_size_k == 128, "block_size_k must be 128"
-            input_scale = torch.empty((M, K // block_size_k), dtype=torch.float32, device=hidden_states.device)
-            qinput_tensor = torch.empty((M, K), dtype=w1.dtype, device=hidden_states.device)
-            per_token_group_quant_fp8(hidden_states, block_size_k, qinput_tensor, input_scale)
+            input_scale, qinput_tensor = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w1.dtype)
         return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale)

     def dispatch(

@@ -340,7 +338,9 @@ def prefilled_group_gemm(
         silu_out = torch.empty((all_tokens, N // 2), device=device, dtype=hidden_dtype)

         silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out)
-        qsilu_out, qsilu_out_scale = tma_aligned_quantize(silu_out)
+        qsilu_out, qsilu_out_scale = per_token_group_quant_fp8(
+            silu_out, self.block_size, dtype=w1.dtype, column_major_scales=True, scale_tma_aligned=True
+        )

         # groupgemm (contiguous layout)
         gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype)
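
For context, a minimal usage sketch of the new call pattern: per_token_group_quant_fp8 now allocates its own outputs and returns the FP8 tensor together with the float32 per-group scales (see fp8act_quant_kernel.py below). Shapes here are illustrative and a CUDA device is assumed.

    import torch
    from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8

    # quantize a bf16 activation in groups of 128 along the hidden dimension
    hidden_states = torch.randn((16, 1024), dtype=torch.bfloat16, device="cuda")
    x_q, x_s = per_token_group_quant_fp8(hidden_states, 128, dtype=torch.float8_e4m3fn)
    print(x_q.shape, x_q.dtype)  # (16, 1024), torch.float8_e4m3fn
    print(x_s.shape, x_s.dtype)  # (16, 8), torch.float32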

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 11 additions & 8 deletions

@@ -526,10 +526,9 @@ def grouped_matmul(
     else:
         _m, _k = token_inputs.shape
         assert _k % block_size_k == 0
-        input_scale = alloc_tensor_func((_m, _k // block_size_k), dtype=torch.float32, device=token_inputs.device)
-        qinput_tensor = alloc_tensor_func((_m, _k), dtype=expert_weights.dtype, device=token_inputs.device)
-        per_token_group_quant_fp8(token_inputs, block_size_k, qinput_tensor, input_scale)
-        token_inputs, token_input_scale = qinput_tensor, input_scale
+        token_inputs, token_input_scale = per_token_group_quant_fp8(
+            token_inputs, block_size_k, dtype=expert_weights.dtype
+        )

     if reused_mblock_infos is None:
         mblocks_to_expert_id, mblocks_to_m_index = moe_align2(token_num_mul_topk_num, expert_to_token_num, BLOCK_SIZE_M)

@@ -627,13 +626,17 @@ def fused_experts_impl(
     CHUNK_SIZE = FFN_MOE_CHUNK_SIZE
     topk_num = topk_ids.shape[1]
     M = min(num_tokens, CHUNK_SIZE)
-
-    intermediate_cache13_shared = alloc_tensor_func((M, topk_num, max(N, w2.shape[1])), device=hidden_states.device, dtype=hidden_states.dtype)
-    intermediate_cache1 = intermediate_cache13_shared.view(-1)[:(M * topk_num * N)].view(M, topk_num, N)
+
+    intermediate_cache13_shared = alloc_tensor_func(
+        (M, topk_num, max(N, w2.shape[1])), device=hidden_states.device, dtype=hidden_states.dtype
+    )
+    intermediate_cache1 = intermediate_cache13_shared.view(-1)[: (M * topk_num * N)].view(M, topk_num, N)
     intermediate_cache2 = alloc_tensor_func(
         (M, topk_num, N // 2), device=hidden_states.device, dtype=hidden_states.dtype
     )
-    intermediate_cache3 = intermediate_cache13_shared.view(-1)[:(M * topk_num * w2.shape[1])].view(M, topk_num, w2.shape[1])
+    intermediate_cache3 = intermediate_cache13_shared.view(-1)[: (M * topk_num * w2.shape[1])].view(
+        M, topk_num, w2.shape[1]
+    )

     if inplace:
         out_hidden_states = hidden_states
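
Aside from the call-site change, the reflowed allocation above keeps the trick of backing intermediate_cache1 and intermediate_cache3 with one shared buffer sized to the wider of the two. A minimal, self-contained sketch of that aliasing (illustrative sizes, not the kernel's real shapes):

    import torch

    M, topk_num, N, K_out = 8, 4, 512, 384  # illustrative sizes
    shared = torch.empty((M, topk_num, max(N, K_out)), dtype=torch.bfloat16)

    # both views reuse the same storage, so only one large workspace is allocated;
    # cache3 overwrites cache1's memory once cache1 is no longer needed
    cache1 = shared.view(-1)[: M * topk_num * N].view(M, topk_num, N)
    cache3 = shared.view(-1)[: M * topk_num * K_out].view(M, topk_num, K_out)
    assert cache1.data_ptr() == shared.data_ptr() == cache3.data_ptr()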

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 4 additions & 15 deletions

@@ -26,17 +26,6 @@
     logger.warning("no deepep or deep_gemm")


-def tma_aligned_quantize(
-    input_tensor: torch.Tensor, block_size: int = 128, dtype: torch.dtype = torch.float8_e4m3fn
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    m, k = input_tensor.shape
-    input_scale = torch.empty((m, k // block_size), dtype=torch.float32, device=input_tensor.device)
-    qinput_tensor = torch.empty((m, k), dtype=dtype, device=input_tensor.device)
-    per_token_group_quant_fp8(input_tensor, block_size, qinput_tensor, input_scale)
-    input_scale = tma_align_input_scale(input_scale)
-    return qinput_tensor, input_scale
-
-
 def masked_group_gemm(
     recv_x: Tuple[torch.Tensor],
     masked_m: torch.Tensor,

@@ -106,9 +95,7 @@ def fused_experts_impl(

     combined_x = None
     if is_prefill:
-        input_scale = torch.empty((M, K // block_size_k), dtype=torch.float32, device=hidden_states.device)
-        qinput_tensor = torch.empty((M, K), dtype=w1.dtype, device=hidden_states.device)
-        per_token_group_quant_fp8(hidden_states, block_size_k, qinput_tensor, input_scale)
+        qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w1.dtype)

         # get_dispatch_layout
         (

@@ -186,7 +173,9 @@ def fused_experts_impl(
         silu_out = torch.empty((all_tokens, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)

         silu_and_mul_fwd(gemm_out_a.view(-1, N), silu_out)
-        qsilu_out, qsilu_out_scale = tma_aligned_quantize(silu_out)
+        qsilu_out, qsilu_out_scale = per_token_group_quant_fp8(
+            silu_out, block_size_k, dtype=w1.dtype, column_major_scales=True, scale_tma_aligned=True
+        )

         # groupgemm (contiguous layout)
         gemm_out_b = torch.empty((all_tokens, K), device=hidden_states.device, dtype=hidden_states.dtype)
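
The removed tma_aligned_quantize helper is subsumed by the column_major_scales=True, scale_tma_aligned=True flags. A small sketch of the scale layout those flags request: the scale matrix is stored column-major with the token dimension padded to a multiple of 4 rows (4 * sizeof(float) = 16 bytes), matching what the deep_gemm TMA loads expect. The padding arithmetic and permute come from the kernel change below; the final [:m] slice here is only for illustration.

    import torch

    m, k, group_size = 10, 512, 128
    aligned_m = (m + 3) // 4 * 4  # token dim padded up to a multiple of 4

    # (k // group_size, aligned_m) storage, exposed as an (m, k // group_size) view
    x_s_storage = torch.empty((k // group_size, aligned_m), dtype=torch.float32)
    x_s = x_s_storage.permute(-1, -2)[:m]
    print(x_s.shape, x_s.stride())  # torch.Size([10, 4]), (1, 12)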

lightllm/common/quantization/deepgemm_quant.py

Lines changed: 11 additions & 11 deletions

@@ -51,22 +51,22 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
         else:
             qweight, weight_scale = weights
             input_scale = None
+        alloc_func = torch.empty
+        if use_custom_tensor_mananger:
+            alloc_func = self.cache_manager.alloc_tensor
         m, k = input_tensor.shape
         n = weights[0].shape[1]
         if input_scale is None:
-            qinput_tensor = self.cache_manager.alloc_tensor(
-                (m, k), qweight.dtype, device=qweight.device, is_graph_out=False
-            )
-            _, input_scale = per_token_group_quant_fp8(
-                input_tensor, self.block_size, qinput_tensor, column_major_scales=True, scale_tma_aligned=True
+            qinput_tensor, input_scale = per_token_group_quant_fp8(
+                input_tensor,
+                self.block_size,
+                dtype=qweight.dtype,
+                column_major_scales=True,
+                scale_tma_aligned=True,
+                alloc_func=alloc_func,
             )

         if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor(
-                    (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
-                )
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+            out = alloc_func((m, n), input_tensor.dtype, device=input_tensor.device)
         deep_gemm.gemm_fp8_fp8_bf16_nt([qinput_tensor, input_scale], [qweight.t(), weight_scale.t()], out)
         return out
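
The same allocator-selection pattern (torch.empty vs. the cache manager, passed down as alloc_func) also appears in triton_quant.py and w8a8_quant.py below. A standalone sketch of the shape of that idea, using a stand-in cache manager (DummyCacheManager and pick_alloc_func are hypothetical names for illustration, not lightllm classes):

    import torch

    class DummyCacheManager:
        """Stand-in for the real cache manager; simply allocates with torch.empty."""

        def alloc_tensor(self, shape, dtype, device=None, is_graph_out=False):
            return torch.empty(shape, dtype=dtype, device=device)

    def pick_alloc_func(cache_manager, use_custom_tensor_mananger: bool):
        # mirror of the new branch in apply(): fall back to torch.empty unless
        # the custom tensor manager is requested
        if use_custom_tensor_mananger:
            return cache_manager.alloc_tensor
        return torch.empty

    alloc = pick_alloc_func(DummyCacheManager(), use_custom_tensor_mananger=True)
    out = alloc((4, 8), torch.bfloat16)  # the manager accepts dtype positionally
    print(out.shape, out.dtype)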

lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py

Lines changed: 23 additions & 15 deletions

@@ -108,22 +108,24 @@ def lightllm_per_token_group_quant_fp8(
 def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
-    x_q: torch.Tensor,
-    x_s: torch.Tensor = None,
     eps: float = 1e-10,
     dtype: torch.dtype = torch.float8_e4m3fn,
     column_major_scales: bool = False,
     scale_tma_aligned: bool = False,
     alloc_func: Callable = torch.empty,
 ):
+    x_q = alloc_func(x.shape, device=x.device, dtype=dtype)
+    x_s = None
     # Adapted from
     # https://github.com/sgl-project/sglang/blob/7e257cd666c0d639626487987ea8e590da1e9395/python/sglang/srt/layers/quantization/fp8_kernel.py#L290
     if HAS_SGL_KERNEL:
         finfo = torch.finfo(dtype)
         fp8_max, fp8_min = finfo.max, finfo.min
+
+        # create the scale tensor
         if column_major_scales:
             if scale_tma_aligned:
-                # aligned to 4 * sizeof(float)
+                # aligned to 4 * sizeof(float)
                 aligned_size = (x.shape[-2] + 3) // 4 * 4
                 x_s = alloc_func(
                     x.shape[:-2] + (x.shape[-1] // group_size, aligned_size),

@@ -137,16 +139,24 @@ def per_token_group_quant_fp8(
                     dtype=torch.float32,
                 ).permute(-1, -2)
         else:
-            if x_s is None:
-                x_s = alloc_func(
-                    x.shape[:-1] + (x.shape[-1] // group_size,),
-                    device=x.device,
-                    dtype=torch.float32,
-                )
+            x_s = alloc_func(
+                x.shape[:-1] + (x.shape[-1] // group_size,),
+                device=x.device,
+                dtype=torch.float32,
+            )
+
+        # quantize with the SGL kernel
         sgl_ops.sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, 1e-10, fp8_min, fp8_max, False)
     else:
+        # quantize with the LightLLM kernel
+        x_s = alloc_func(
+            x.shape[:-1] + (x.shape[-1] // group_size,),
+            device=x.device,
+            dtype=torch.float32,
+        )
         lightllm_per_token_group_quant_fp8(x, group_size, x_q, x_s, eps=1e-10, dtype=torch.float8_e4m3fn)
-
+    if column_major_scales and scale_tma_aligned:
+        x_s = tma_align_input_scale(x_s)
     return x_q, x_s


@@ -237,9 +247,9 @@ def test_tma_align():
     m = 576
     k = 8192
     x = torch.randn((m, k // 128), dtype=torch.float32).cuda()
+
     for _ in range(10):
         x_padded = tma_align_input_scale(x)
-        print(x_padded.shape)
     import time

     torch.cuda.synchronize()

@@ -255,11 +265,9 @@ def test_tma_align():

 def test_per_token_group_quant_fp8():
     group_size = 128
     x = torch.randn((1024, 8192), dtype=torch.bfloat16).cuda()
-
-    x_q = torch.randn((1024, 8192)).cuda().to(torch.float8_e4m3fn)
     # x_s = torch.randn((1024, 8192 // group_size), dtype=torch.float32).cuda()
     # x_s = torch.randn((8192 // group_size, 1024 + 10), dtype=torch.float32).cuda().t()
-    _, x_s = per_token_group_quant_fp8(x, group_size, x_q, None, column_major_scales=True)
+    x_q, x_s = per_token_group_quant_fp8(x, group_size, column_major_scales=True, scale_tma_aligned=True)
     x_s = x_s[:1024]
     th_x_q, th_x_s = torch_quant(x, group_size)
     print("th_x_s - x_s", torch.abs(th_x_s - x_s.reshape(-1)).max())

@@ -268,4 +276,4 @@ def test_per_token_group_quant_fp8():

 if __name__ == "__main__":
     test_per_token_group_quant_fp8()
-    # test_tma_align()
+    test_tma_align()
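
For intuition about what the kernel computes, here is a pure-PyTorch reference of per-token-group FP8 quantization with a round-trip check: one float32 scale per contiguous group of group_size values, chosen so the group's max maps to the FP8 e4m3 max. This is an independent sketch of the math, not the repo's torch_quant helper or the Triton/SGL kernels themselves.

    import torch

    def torch_per_token_group_quant_fp8(x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn):
        finfo = torch.finfo(dtype)
        m, k = x.shape
        xg = x.view(m, k // group_size, group_size).float()
        amax = xg.abs().amax(dim=-1, keepdim=True).clamp(min=eps)
        scale = amax / finfo.max                      # one float32 scale per (token, group)
        x_q = (xg / scale).clamp(finfo.min, finfo.max).to(dtype).view(m, k)
        return x_q, scale.squeeze(-1)

    def dequant(x_q, x_s, group_size):
        m, k = x_q.shape
        xg = x_q.float().view(m, k // group_size, group_size)
        return (xg * x_s.view(m, k // group_size, 1)).view(m, k)

    x = torch.randn((64, 256), dtype=torch.float32)
    x_q, x_s = torch_per_token_group_quant_fp8(x, 128)
    err = (dequant(x_q, x_s, 128) - x).abs().max()
    print("max round-trip error:", err.item())        # bounded by roughly group amax / 28 for e4m3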

lightllm/common/quantization/triton_quant/triton_quant.py

Lines changed: 6 additions & 12 deletions

@@ -38,26 +38,20 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
         qweight, weight_scale, input_scale = weights
         m, k = input_tensor.shape
         n = qweight.shape[1]
+        alloc_func = torch.empty
+        if use_custom_tensor_mananger:
+            alloc_func = self.cache_manager.alloc_tensor
         if input_scale is None:
-            input_scale = self.cache_manager.alloc_tensor(
-                (m, k // self.block_size), torch.float32, device=input_tensor.device, is_graph_out=False
+            input_tensor_q, input_scale = per_token_group_quant_fp8(
+                input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
             )
-            input_tensor_q = self.cache_manager.alloc_tensor(
-                (m, k), qweight.dtype, device=qweight.device, is_graph_out=False
-            )
-            per_token_group_quant_fp8(input_tensor, self.block_size, input_tensor_q, input_scale)
         else:
             # TODO
             raise "statci input scale is not supported by triton fp8 block gemm kernel."
         m = input_tensor.shape[0]
         n = qweight.shape[1]
         if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor(
-                    (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
-                )
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+            out = alloc_func((m, n), input_tensor.dtype, device=input_tensor.device)
         w8a8_block_fp8_matmul(
             input_tensor_q,
             qweight,

lightllm/common/quantization/w8a8_quant.py

Lines changed: 6 additions & 12 deletions

@@ -131,21 +131,15 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
         qweight, weight_scale, input_scale = weights
         m, k = input_tensor.shape
         n = weights[0].shape[1]
+        alloc_func = torch.empty
+        if use_custom_tensor_mananger:
+            alloc_func = self.cache_manager.alloc_tensor
         if input_scale is None:
-            input_scale = self.cache_manager.alloc_tensor(
-                (m, k // self.block_size), torch.float32, device=input_tensor.device, is_graph_out=False
+            qinput_tensor, input_scale = per_token_group_quant_fp8(
+                input_tensor, self.block_size, dtype=qweight.dtype, alloc_func=alloc_func
            )
-            qinput_tensor = self.cache_manager.alloc_tensor(
-                (m, k), qweight.dtype, device=qweight.device, is_graph_out=False
-            )
-            per_token_group_quant_fp8(input_tensor, self.block_size, qinput_tensor, input_scale)
         if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor(
-                    (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
-                )
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+            out = alloc_func((m, n), input_tensor.dtype, device=input_tensor.device)
         if n % 128 != 0:
             w8a8_block_fp8_matmul(
                 qinput_tensor,

unit_tests/common/fused_moe/test_moe_silu_and_mul_mix_quant_ep.py

Lines changed: 2 additions & 7 deletions

@@ -32,20 +32,15 @@ def test_silu_and_mul_masked(expert_num, token_num, hidden_dim):
     )

     true_out_tensor_mid = torch.randn((expert_num, token_num, hidden_dim // 2), dtype=torch.float16, device="cuda")
-    true_out_tensor = torch.empty((expert_num, token_num, hidden_dim // 2), dtype=torch.float8_e4m3fn, device="cuda")
-    true_out_scale_tensor = torch.randn(
-        (expert_num, token_num, hidden_dim // 2 // quant_group_size), dtype=torch.float32, device="cuda"
-    )

     masked_m = [random.randint(0, token_num) for _ in range(expert_num)]
     masked_m = torch.tensor(masked_m, dtype=torch.int32, device="cuda")

     silu_and_mul_fwd(in_tensor.view(-1, hidden_dim), true_out_tensor_mid.view(-1, hidden_dim // 2))
-    per_token_group_quant_fp8(
+    true_out_tensor, true_out_scale_tensor = per_token_group_quant_fp8(
         true_out_tensor_mid.view(-1, hidden_dim // 2),
         quant_group_size,
-        true_out_tensor.view(-1, hidden_dim // 2),
-        true_out_scale_tensor.view(-1, hidden_dim // 2 // quant_group_size),
+        alloc_func=torch.empty,
     )

     silu_and_mul_masked_post_quant_fwd(in_tensor, out_tensor, out_scale_tensor, quant_group_size, masked_m)
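
With the reference tensors now produced by per_token_group_quant_fp8 directly, a comparison of this kind typically dequantizes both sides and checks only the rows each expert actually filled (masked_m). A hedged, self-contained sketch of that comparison pattern (tensor names and shapes below are illustrative, not the test's exact ones):

    import torch

    def dequant_groups(x_q, x_s, group_size):
        *lead, k = x_q.shape
        xg = x_q.float().reshape(*lead, k // group_size, group_size)
        return (xg * x_s.float().unsqueeze(-1)).reshape(*lead, k)

    expert_num, token_num, dim, group = 2, 6, 256, 128
    ref_q = torch.randn(expert_num, token_num, dim).to(torch.float8_e4m3fn)
    ref_s = torch.rand(expert_num, token_num, dim // group)
    out_q, out_s = ref_q.clone(), ref_s.clone()   # stand-in for the kernel output
    masked_m = torch.tensor([4, 2])               # valid rows per expert

    for e in range(expert_num):
        n = int(masked_m[e])
        torch.testing.assert_close(
            dequant_groups(out_q[e, :n], out_s[e, :n], group),
            dequant_groups(ref_q[e, :n], ref_s[e, :n], group),
        )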
