@@ -5,18 +5,14 @@
 import torch.nn.functional as F
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
+from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops
+from lightllm.utils.sgl_utils import HAS_SGL_KERNEL, sgl_ops
 
-try:
-    HAS_VLLM = True
-    from lightllm.common.vllm_kernel import _custom_ops as ops
-except:
-    HAS_VLLM = False
 
-
-class vLLMBaseQuantizationMethod(QuantizationMethod):
+class BaseQuantizationMethod(QuantizationMethod):
     def __init__(self):
         super().__init__()
-        assert HAS_VLLM, "vllm is not installed, you can't use quant api of it"
+        assert HAS_VLLM and HAS_SGL_KERNEL, "vllm and sgl_kernel are not installed, you can't use quant api of them."
         from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 
         self.cache_manager = g_cache_manager
@@ -30,8 +26,8 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None):
         pass
 
 
-@QUANTMETHODS.register(["vllm-w8a8"])
-class vLLMw8a8QuantizationMethod(vLLMBaseQuantizationMethod):
+@QUANTMETHODS.register(["vllm-w8a8", "w8a8"])
+class w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
 
@@ -53,7 +49,7 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
         else:
             raise ValueError("vllm-quant Weights must be a tuple of length 2 or 3.")
 
-        x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
+        x_q, x_scale, x_zp = vllm_ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
         m = input_tensor.shape[0]
         n = qweight.shape[1]
         if out is None:
@@ -67,8 +63,8 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
             return out
 
 
-@QUANTMETHODS.register(["vllm-fp8w8a8"])
-class vLLMFP8w8a8QuantizationMethod(vLLMBaseQuantizationMethod):
+@QUANTMETHODS.register(["vllm-fp8w8a8", "fp8w8a8"])
+class FP8w8a8QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.is_moe = False
@@ -88,15 +84,15 @@ def quantize(self, weight: torch.Tensor):
     def quantize_scaled_mm_fp8(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
-        qweight, weight_scale = ops.scaled_fp8_quant(
+        qweight, weight_scale = vllm_ops.scaled_fp8_quant(
             weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
         return qweight.transpose(0, 1), weight_scale
 
     def quantize_pingpong_fp8(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
-        qweight, weight_scale = ops.scaled_fp8_quant(
+        qweight, weight_scale = vllm_ops.scaled_fp8_quant(
             weight.contiguous().cuda(), scale=None, use_per_token_if_dynamic=False
         )
         return qweight.transpose(0, 1), weight_scale
@@ -107,7 +103,7 @@ def quantize_moe(self, weight):
         weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
         for i in range(num_experts):
-            qweight, weight_scale = ops.scaled_fp8_quant(
+            qweight, weight_scale = vllm_ops.scaled_fp8_quant(
                 weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=False
             )
             qweights[i] = qweight
@@ -121,7 +117,7 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
     def apply_scaled_mm_fp8(
         self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
     ):
-        x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+        x_q, x_scale = vllm_ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
         if out is None:
@@ -137,7 +133,9 @@ def apply_scaled_mm_fp8(
     def apply_pingpong_fp8(
         self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True
     ):
-        x_q, x_scale = ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=False)
+        x_q, x_scale = vllm_ops.scaled_fp8_quant(
+            input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=False
+        )
         assert bias is None
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
@@ -154,8 +152,8 @@ def apply_pingpong_fp8(
         return cutlass_scaled_mm(x_q, weights[0], x_scale, weights[1], out)
 
 
-@QUANTMETHODS.register(["vllm-fp8w8a8-b128"])
-class vLLMFP8w8a8B128QuantizationMethod(vLLMBaseQuantizationMethod):
+@QUANTMETHODS.register(["vllm-fp8w8a8-b128", "fp8w8a8-b128"])
+class FP8w8a8B128QuantizationMethod(BaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.block_size = 128
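
Note on the two activation-quantization modes exercised above: a minimal sketch, assuming vLLM is installed and that vllm_ops re-exports vLLM's scaled_fp8_quant as the new lightllm.utils.vllm_utils import suggests. Exact signatures may vary across vLLM versions; the tensor shapes below are illustrative only.

import torch
from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops

if HAS_VLLM:
    x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")
    # Per-token dynamic scaling (one scale per row), as in quantize_scaled_mm_fp8 / apply_scaled_mm_fp8.
    x_q_tok, scale_tok = vllm_ops.scaled_fp8_quant(x, scale=None, use_per_token_if_dynamic=True)
    # Per-tensor dynamic scaling (a single scale), as in quantize_pingpong_fp8 / apply_pingpong_fp8.
    x_q_tsr, scale_tsr = vllm_ops.scaled_fp8_quant(x, scale=None, use_per_token_if_dynamic=False)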