5 | 5 | import torch.nn.functional as F |
6 | 6 | from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8 |
7 | 7 | from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul |
8 | | -from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops |
9 | | -from lightllm.utils.sgl_utils import HAS_SGL_KERNEL, sgl_ops |
| 8 | +from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm |
10 | 9 |
11 | 10 |
12 | 11 | class BaseQuantizationMethod(QuantizationMethod): |
13 | 12 | def __init__(self): |
14 | 13 | super().__init__() |
15 | | - assert HAS_VLLM and HAS_SGL_KERNEL, "vllm and sgl_kernel are not installed, you can't use quant api of them." |
| 14 | + assert HAS_VLLM, "vllm is not installed, you can't use its quant api."
16 | 15 | from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager |
17 | 16 |
18 | 17 | self.cache_manager = g_cache_manager |
@@ -59,7 +58,7 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_ |
59 | 58 | ) |
60 | 59 | else: |
61 | 60 | out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device) |
62 | | - torch.ops._C.cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias) |
| 61 | + cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias) |
63 | 62 | return out |
64 | 63 |
65 | 64 |
@@ -127,7 +126,7 @@ def apply_scaled_mm_fp8( |
127 | 126 | ) |
128 | 127 | else: |
129 | 128 | out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device) |
130 | | - torch.ops._C.cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias) |
| 129 | + cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias) |
131 | 130 | return out |
132 | 131 |
133 | 132 | def apply_pingpong_fp8( |
@@ -195,5 +194,5 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_ |
195 | 194 | ) |
196 | 195 | else: |
197 | 196 | input_scale = input_scale.t().contiguous().t() |
198 | | - torch.ops._C.cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias) |
| 197 | + cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias) |
199 | 198 | return out |
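
For context, this change routes every call through a `cutlass_scaled_mm` helper exported by `lightllm.utils.vllm_utils` instead of hitting vLLM's private `torch.ops._C.cutlass_scaled_mm` op directly, and drops the `sgl_kernel` requirement along the way. A minimal sketch of what such a wrapper could look like, assuming it simply forwards to the vLLM custom op (the real helper lives in `lightllm/utils/vllm_utils.py` and may differ):

```python
# Hypothetical sketch of the cutlass_scaled_mm helper this diff imports;
# the actual implementation in lightllm/utils/vllm_utils.py may differ.
import torch

try:
    import vllm  # importing vllm registers its _C custom ops with torch
    HAS_VLLM = True
except ImportError:
    HAS_VLLM = False


def cutlass_scaled_mm(out, x_q, weight_q, x_scale, weight_scale, bias=None):
    """Scaled GEMM via vLLM's CUTLASS kernel: writes
    (x_q * x_scale) @ (weight_q * weight_scale) [+ bias] into out."""
    assert HAS_VLLM, "vllm is not installed, you can't use its quant api."
    torch.ops._C.cutlass_scaled_mm(out, x_q, weight_q, x_scale, weight_scale, bias)
    return out
```

Funneling the `_C` access through one module means call sites no longer depend on vLLM's private op namespace, so a future vLLM-side rename only needs a fix in `vllm_utils`.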