
Commit ceb4bc1

Commit message: update
1 parent 0adcb16 commit ceb4bc1

6 files changed: +18 additions, -66 deletions


lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py

Lines changed: 3 additions & 8 deletions
@@ -3,16 +3,11 @@
 import triton.language as tl
 
 from lightllm.common.kernel_config import KernelConfigs
+from lightllm.utils.sgl_utils import HAS_SGL_KERNEL, sgl_ops
 from frozendict import frozendict
 from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple
 
-try:
-    HAS_SGLANG_KERNEL = True
-    from sgl_kernel import sgl_per_token_group_quant_fp8
-except:
-    HAS_SGLANG_KERNEL = False
-
 try:
     from deep_gemm import ceil_div
 except:
@@ -118,10 +113,10 @@ def per_token_group_quant_fp8(
     eps: float = 1e-10,
     dtype: torch.dtype = torch.float8_e4m3fn,
 ):
-    if HAS_SGLANG_KERNEL:
+    if HAS_SGL_KERNEL:
         finfo = torch.finfo(dtype)
         fp8_max, fp8_min = finfo.max, finfo.min
-        sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, 1e-10, fp8_min, fp8_max)
+        sgl_ops.sgl_per_token_group_quant_fp8(x, x_q, x_s, group_size, 1e-10, fp8_min, fp8_max)
     else:
         lightllm_per_token_group_quant_fp8(x, group_size, x_q, x_s, eps=1e-10, dtype=torch.float8_e4m3fn)
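
For reference, both branches of per_token_group_quant_fp8 compute the same thing: for each token and each contiguous group of group_size channels, scale by the group's absolute max so the values fit the fp8 e4m3 range, and keep one float32 scale per (token, group). The sketch below is a plain-PyTorch restatement under an assumed (num_tokens, hidden) input shape, not the Triton or sgl_kernel implementation; the helper name per_token_group_quant_fp8_ref is made up for illustration.

import torch

def per_token_group_quant_fp8_ref(x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn):
    # x: (num_tokens, hidden), hidden divisible by group_size.
    finfo = torch.finfo(dtype)
    m, n = x.shape
    xg = x.float().reshape(m, n // group_size, group_size)
    # One scale per (token, group): amax / fp8_max, floored at eps to avoid divide-by-zero.
    amax = xg.abs().amax(dim=-1, keepdim=True).clamp_min(eps)
    x_s = amax / finfo.max
    x_q = (xg / x_s).clamp(finfo.min, finfo.max).to(dtype).reshape(m, n)
    return x_q, x_s.squeeze(-1)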

lightllm/common/quantization/w8a8_quant.py

Lines changed: 0 additions & 46 deletions
@@ -67,35 +67,15 @@ class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.is_moe = False
-        # PINGPONG_FP8_GEMM is per tensor quant way.
-        self.use_pingpong_fp8_gemm = os.getenv("ENABLE_PINGPONG_FP8_GEMM", "0").upper() in ["ON", "TRUE", "1"]
-
-        if self.use_pingpong_fp8_gemm:
-            self.quantize = self.quantize_pingpong_fp8
-            self.apply = self.apply_pingpong_fp8
-        else:
-            self.quantize = self.quantize_scaled_mm_fp8
-            self.apply = self.apply_scaled_mm_fp8
 
     def quantize(self, weight: torch.Tensor):
-        raise Exception("This function needs to be bound.")
-
-    def quantize_scaled_mm_fp8(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
         qweight, weight_scale = vllm_ops.scaled_fp8_quant(
             weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
         return qweight.transpose(0, 1), weight_scale
 
-    def quantize_pingpong_fp8(self, weight: torch.Tensor):
-        if self.is_moe:
-            return self.quantize_moe(weight)
-        qweight, weight_scale = vllm_ops.scaled_fp8_quant(
-            weight.contiguous().cuda(), scale=None, use_per_token_if_dynamic=False
-        )
-        return qweight.transpose(0, 1), weight_scale
-
     def quantize_moe(self, weight):
         num_experts = weight.shape[0]
         qweights = []
@@ -111,11 +91,6 @@ def quantize_moe(self, weight):
         return qweights, weight_scale
 
     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
-        raise Exception("This function needs to be bound.")
-
-    def apply_scaled_mm_fp8(
-        self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
-    ):
         x_q, x_scale = vllm_ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
@@ -129,27 +104,6 @@ def apply_scaled_mm_fp8(
         cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
         return out
 
-    def apply_pingpong_fp8(
-        self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
-    ):
-        x_q, x_scale = vllm_ops.scaled_fp8_quant(
-            input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=False
-        )
-        assert bias is None
-        m = input_tensor.shape[0]
-        n = weights[0].shape[1]
-        if out is None:
-            if use_custom_tensor_mananger:
-                out = self.cache_manager.alloc_tensor(
-                    (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
-                )
-            else:
-                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-        from fp8_pingpong_gemm import cutlass_scaled_mm
-
-        return cutlass_scaled_mm(x_q, weights[0], x_scale, weights[1], out)
-
 
 @QUANTMETHODS.register(["vllm-fp8w8a8-b128, fp8w8a8-b128"])
 class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
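
With the pingpong per-tensor path removed, quantize and apply are the per-token scaled_mm implementations directly, so the ENABLE_PINGPONG_FP8_GEMM switch and the runtime method binding disappear. For reference, the surviving path does dynamic per-token FP8 quantization of the activation, an fp8 matmul against the pre-quantized weight, and rescaling of the fp32 accumulator by the per-token and per-output-channel scales. The sketch below is a plain-PyTorch restatement of that math, not the cutlass kernel; scaled_mm_fp8_ref, the (K, N) weight layout, and the (N,) weight-scale shape are assumptions for illustration.

import torch

def scaled_mm_fp8_ref(x, w_q, w_scale, bias=None, dtype=torch.float8_e4m3fn):
    # x: (M, K) activations; w_q: (K, N) fp8 weight; w_scale: (N,) per-output-channel scales.
    finfo = torch.finfo(dtype)
    # Dynamic per-token activation quantization.
    x_scale = x.float().abs().amax(dim=-1, keepdim=True).clamp_min(1e-12) / finfo.max
    x_q = (x.float() / x_scale).clamp(finfo.min, finfo.max).to(dtype)
    # Accumulate in fp32, then undo both scalings; out approximates x @ W.
    out = (x_q.float() @ w_q.float()) * x_scale * w_scale
    if bias is not None:
        out = out + bias
    return out.to(x.dtype)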

lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 6 deletions
@@ -29,15 +29,10 @@
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.utils.dist_utils import get_global_world_size
 from lightllm.utils.log_utils import init_logger
+from lightllm.utils.sgl_utils import flash_attn_varlen_func, flash_attn_with_kvcache, merge_state_v2
 
 logger = init_logger(__name__)
 
-try:
-    from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
-    from sgl_kernel import merge_state_v2
-except:
-    logger.warning("sgl_kernel is not installed, or the installed version does not support fa3!")
-
 
 class Deepseek2TransformerLayerInfer(LlamaTransformerLayerInfer):
     def __init__(self, layer_num, network_config, mode=[]):
@@ -311,6 +306,7 @@ def _context_attention_flashattention_kernel_with_CC(
         layer_weight: Deepseek2TransformerLayerWeight,
         out=None,
     ) -> torch.Tensor:
+        assert flash_attn_varlen_func is not None, "fa3 is not available. It requires sm90 and above."
         k_nope, k_rope, v = self._decompress_kv(
             kv,
             infer_state,

lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 4 deletions
@@ -29,10 +29,7 @@
 
 logger = init_logger(__name__)
 
-try:
-    from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
-except:
-    logger.warning("sgl_kernel is not installed, or the installed version does not support fa3!")
+from lightllm.utils.sgl_utils import flash_attn_with_kvcache
 
 
 class LlamaTransformerLayerInfer(TransformerLayerInferTpl):
@@ -252,6 +249,7 @@ def _context_attention_kernel_ppl_int8kv(
         return o_tensor
 
     def _context_attention_flashattention(self, q, kv, infer_state: LlamaInferStateInfo, layer_weight, out=None):
+        assert flash_attn_with_kvcache is not None, "fa3 is not available. It requires sm90 and above."
         cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
             -1, 1, self.tp_k_head_num_, self.head_dim_
         )

lightllm/utils/sgl_utils.py

Lines changed: 10 additions & 1 deletion
@@ -3,9 +3,18 @@
 logger = init_logger(__name__)
 try:
     import sgl_kernel
-    import sgl_kernel.allreduce as sgl_allreduce_ops
 
     sgl_ops = sgl_kernel
+    sgl_allreduce_ops = sgl_ops.allreduce
+    if sgl_ops.flash_attn.is_fa3_supported():
+        flash_attn_varlen_func = sgl_ops.flash_attn.flash_attn_varlen_func
+        flash_attn_with_kvcache = sgl_ops.flash_attn.flash_attn_with_kvcache
+        merge_state_v2 = sgl_ops.flash_attn.merge_state_v2
+    else:
+        flash_attn_varlen_func = None
+        flash_attn_with_kvcache = None
+        merge_state_v2 = None
+        logger.warning("Fa3 is only supported on sm90 and above.")
     HAS_SGL_KERNEL = True
 except:
     HAS_SGL_KERNEL = False
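
The net effect across the deepseek2 and llama layers above is one shared pattern: optional sgl_kernel symbols are resolved once in this module and exported either as callables or as None, and call sites assert right before use instead of repeating try/except imports. A minimal self-contained sketch of that pattern, with a made-up consumer function name:

from typing import Callable, Optional

flash_attn_with_kvcache: Optional[Callable] = None
try:
    import sgl_kernel  # optional dependency

    if sgl_kernel.flash_attn.is_fa3_supported():
        flash_attn_with_kvcache = sgl_kernel.flash_attn.flash_attn_with_kvcache
    HAS_SGL_KERNEL = True
except Exception:
    HAS_SGL_KERNEL = False

# Consumer side (e.g. an attention layer): fail loudly at call time, not at import time.
def context_attention_flashattention(*args, **kwargs):
    assert flash_attn_with_kvcache is not None, "fa3 is not available. It requires sm90 and above."
    return flash_attn_with_kvcache(*args, **kwargs)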

test/model/model_infer.py

Lines changed: 1 addition & 1 deletion
@@ -362,7 +362,7 @@ def tppart_model_infer(args, model_kvargs, batch_size, input_len, output_len, an
                 total_token_num,
                 b_ready_cache_len,
             ),
-            log_dir=f"./logs_sglang_4k/forward_prefill_{model_kvargs['rank_id']}",
+            log_dir=f"./logs/forward_prefill_{model_kvargs['rank_id']}",
         )
     else:
         torch_profile(
