import torch.nn.functional as F
from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
+ from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm

- try:
-     HAS_VLLM = True
-     from lightllm.common.vllm_kernel import _custom_ops as ops
- except:
-     HAS_VLLM = False

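For context on the new import above, a minimal sketch of what lightllm/utils/vllm_utils.py is assumed to export; the real helper module may differ. Only the torch.ops._C.cutlass_scaled_mm argument order is taken from the call sites replaced in this diff, the rest is an illustrative guess.

import torch

try:
    # vLLM's custom quant kernels (scaled_int8_quant, scaled_fp8_quant, ...)
    from vllm import _custom_ops as vllm_ops
    HAS_VLLM = True
except ImportError:
    vllm_ops = None
    HAS_VLLM = False

def cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias=None):
    # Thin wrapper over the raw CUTLASS kernel; writes into `out` and returns it.
    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
    return out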
-
- class vLLMBaseQuantizationMethod(QuantizationMethod):
+ class BaseQuantizationMethod(QuantizationMethod):
    def __init__(self):
        super().__init__()
-         assert HAS_VLLM, "vllm is not installed, you can't use quant api of it "
+         assert HAS_VLLM, "vllm is not installed, you can't use its quant api."
        from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager

        self.cache_manager = g_cache_manager
@@ -30,8 +25,8 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None):
        pass


- @QUANTMETHODS.register(["vllm-w8a8"])
- class vLLMw8a8QuantizationMethod(vLLMBaseQuantizationMethod):
+ @QUANTMETHODS.register(["vllm-w8a8", "w8a8"])
+ class w8a8QuantizationMethod(BaseQuantizationMethod):
    def __init__(self):
        super().__init__()

@@ -53,7 +48,7 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
        else:
            raise ValueError("vllm-quant Weights must be a tuple of length 2 or 3.")

-         x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
+         x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
        m = input_tensor.shape[0]
        n = qweight.shape[1]
        if out is None:
@@ -63,51 +58,31 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
                )
            else:
                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-         torch.ops._C.cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
+         cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
        return out

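For reference, a self-contained sketch of the dynamic (scale=None) per-token int8 path that w8a8QuantizationMethod.apply() implements above; only the vllm_ops / cutlass_scaled_mm calls mirror this diff, the wrapper function and shape comments are illustrative assumptions and not part of the commit.

import torch
from lightllm.utils.vllm_utils import vllm_ops, cutlass_scaled_mm

def w8a8_linear_sketch(x: torch.Tensor, qweight: torch.Tensor, weight_scale: torch.Tensor) -> torch.Tensor:
    # x: (M, K) fp16/bf16 activations; qweight and weight_scale as produced by this class's quantize().
    x_q, x_scale, _ = vllm_ops.scaled_int8_quant(x, scale=None, azp=None, symmetric=True)
    out = torch.empty((x.shape[0], qweight.shape[1]), dtype=x.dtype, device=x.device)
    cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, None)
    return out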
- @QUANTMETHODS.register(["vllm-fp8w8a8"])
- class vLLMFP8w8a8QuantizationMethod(vLLMBaseQuantizationMethod):
+ @QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"])
+ class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
    def __init__(self):
        super().__init__()
        self.is_moe = False
-         # PINGPONG_FP8_GEMM is per tensor quant way.
-         self.use_pingpong_fp8_gemm = os.getenv("ENABLE_PINGPONG_FP8_GEMM", "0").upper() in ["ON", "TRUE", "1"]
-
-         if self.use_pingpong_fp8_gemm:
-             self.quantize = self.quantize_pingpong_fp8
-             self.apply = self.apply_pingpong_fp8
-         else:
-             self.quantize = self.quantize_scaled_mm_fp8
-             self.apply = self.apply_scaled_mm_fp8

    def quantize(self, weight: torch.Tensor):
-         raise Exception("This function needs to be bound.")
-
-     def quantize_scaled_mm_fp8(self, weight: torch.Tensor):
        if self.is_moe:
            return self.quantize_moe(weight)
-         qweight, weight_scale = ops.scaled_fp8_quant(
+         qweight, weight_scale = vllm_ops.scaled_fp8_quant(
            weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
        )
        return qweight.transpose(0, 1), weight_scale

-     def quantize_pingpong_fp8(self, weight: torch.Tensor):
-         if self.is_moe:
-             return self.quantize_moe(weight)
-         qweight, weight_scale = ops.scaled_fp8_quant(
-             weight.contiguous().cuda(), scale=None, use_per_token_if_dynamic=False
-         )
-         return qweight.transpose(0, 1), weight_scale
-
    def quantize_moe(self, weight):
        num_experts = weight.shape[0]
        qweights = []
        weight_scales = []
        qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
        for i in range(num_experts):
-             qweight, weight_scale = ops.scaled_fp8_quant(
+             qweight, weight_scale = vllm_ops.scaled_fp8_quant(
                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=False
            )
            qweights[i] = qweight
@@ -116,12 +91,7 @@ def quantize_moe(self, weight):
        return qweights, weight_scale

    def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
-         raise Exception("This function needs to be bound.")
-
-     def apply_scaled_mm_fp8(
-         self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
-     ):
-         x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+         x_q, x_scale = vllm_ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
        m = input_tensor.shape[0]
        n = weights[0].shape[1]
        if out is None:
@@ -131,31 +101,12 @@ def apply_scaled_mm_fp8(
                )
            else:
                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-         torch.ops._C.cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
+         cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
        return out

-     def apply_pingpong_fp8(
-         self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
-     ):
-         x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=False)
-         assert bias is None
-         m = input_tensor.shape[0]
-         n = weights[0].shape[1]
-         if out is None:
-             if use_custom_tensor_mananger:
-                 out = self.cache_manager.alloc_tensor(
-                     (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
-                 )
-             else:
-                 out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
-
-         from fp8_pingpong_gemm import cutlass_scaled_mm
-
-         return cutlass_scaled_mm(x_q, weights[0], x_scale, weights[1], out)
-

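A hedged illustration of what the per-token FP8 quantize() above produces for a dense weight: one FP8 row per output channel plus a per-row fp32 scale, stored transposed so apply() can pass it straight to cutlass_scaled_mm. Sizes and the round-trip check below are illustrative only, not part of this commit.

import torch
from lightllm.utils.vllm_utils import vllm_ops

w = torch.randn(1024, 4096, dtype=torch.float16, device="cuda")  # (out_features, in_features), sizes illustrative
w_q, w_scale = vllm_ops.scaled_fp8_quant(w.contiguous(), scale=None, use_per_token_if_dynamic=True)
w_ref = w_q.to(torch.float32) * w_scale       # dequantized approximation of w; scale broadcasts per row
print((w_ref - w.float()).abs().max())        # small quantization error expected
qweight = w_q.transpose(0, 1)                 # (in_features, out_features) layout returned by quantize()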
- @QUANTMETHODS.register(["vllm-fp8w8a8-b128"])
- class vLLMFP8w8a8B128QuantizationMethod(vLLMBaseQuantizationMethod):
+ @QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"])
+ class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
    def __init__(self):
        super().__init__()
        self.block_size = 128
@@ -197,5 +148,5 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
            )
        else:
            input_scale = input_scale.t().contiguous().t()
-         torch.ops._C.cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
+         cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
        return out
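The b128 variant above keeps FP8 weight scales per 128x128 block rather than per output channel. A conceptual sketch of how such a block-scaled weight maps back to full precision, assuming a (ceil(K/128), ceil(N/128)) scale layout; the kernels in this diff consume the quantized form directly and never materialize this.

import torch

def dequant_block_fp8_weight(qweight: torch.Tensor, weight_scale: torch.Tensor, block_size: int = 128) -> torch.Tensor:
    # qweight: (K, N) float8_e4m3fn; weight_scale: (ceil(K/block), ceil(N/block)) fp32 -- assumed layout.
    k, n = qweight.shape
    scale = weight_scale.repeat_interleave(block_size, dim=0)[:k]
    scale = scale.repeat_interleave(block_size, dim=1)[:, :n]
    return qweight.to(torch.float32) * scale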