
Commit 27cea3a ("0529")
1 parent: 815a698

3 files changed: +16 −8 lines

lightllm/common/quantization/w8a8_quant.py

Lines changed: 12 additions & 3 deletions
@@ -6,6 +6,15 @@
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.common.quantization.triton_quant.fp8.fp8w8a8_block_gemm_kernel import w8a8_block_fp8_matmul
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
+
+if not HAS_LIGHTLLM_KERNEL:
+
+    def scaled_fp8_quant(tensor, *args, **kwargs):
+        return light_ops.per_token_quant_bf16_fp8(tensor)
+
+else:
+    scaled_fp8_quant = vllm_ops.scaled_fp8_quant
 
 
 class BaseQuantizationMethod(QuantizationMethod):
@@ -71,7 +80,7 @@ def __init__(self):
     def quantize(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
-        qweight, weight_scale = vllm_ops.scaled_fp8_quant(
+        qweight, weight_scale = scaled_fp8_quant(
             weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
         )
         return qweight.transpose(0, 1), weight_scale
@@ -82,7 +91,7 @@ def quantize_moe(self, weight):
         weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
         for i in range(num_experts):
-            qweight, weight_scale = vllm_ops.scaled_fp8_quant(
+            qweight, weight_scale = scaled_fp8_quant(
                 weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=False
             )
             qweights[i] = qweight
@@ -91,7 +100,7 @@ def quantize_moe(self, weight):
         return qweights, weight_scale
 
     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
-        x_q, x_scale = vllm_ops.scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
+        x_q, x_scale = scaled_fp8_quant(input_tensor, scale=None, scale_ub=None, use_per_token_if_dynamic=True)
         m = input_tensor.shape[0]
         n = weights[0].shape[1]
         if out is None:
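
The point of the new module-level shim in this file is that quantize(), quantize_moe() and apply() all call a single scaled_fp8_quant name, with the backend (light_ops.per_token_quant_bf16_fp8 or vllm_ops.scaled_fp8_quant) chosen once at import time. For orientation, here is a minimal, hypothetical pure-PyTorch sketch of what a dynamic per-token FP8 quantizer of this shape does. It is not the implementation behind either backend; the extra arguments exist only so call sites that pass scale=None or use_per_token_if_dynamic keep working.

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def per_token_fp8_quant_sketch(x: torch.Tensor, *args, **kwargs):
    # Dynamic per-token (per-row) scaling: one fp32 scale per row of x.
    amax = x.abs().amax(dim=-1, keepdim=True).to(torch.float32).clamp(min=1e-12)
    scale = amax / FP8_MAX
    x_q = (x.to(torch.float32) / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_q, scale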

lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 2 additions & 3 deletions
@@ -26,6 +26,7 @@
 from lightllm.distributed.communication_op import all_gather_into_tensor, reduce_scatter_tensor
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import get_env_start_args
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
 logger = init_logger(__name__)
 
@@ -539,11 +540,9 @@ def _token_decode_attention_ppl_int8kv(self, q, infer_state: LlamaInferStateInfo
         calcu_shape1 = (batch_size, self.tp_q_head_num_, self.head_dim_)
         o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
 
-        from lightllm_ppl_kernel import group8_int8kv_decode_attention
-
         # group_int8kv_decode_attention(at::Tensor o, at::Tensor q, at::Tensor k, at::Tensor k_s, at::Tensor v,
         # at::Tensor v_s, at::Tensor b_loc, at::Tensor b_seq_len, int max_len_in_batch)
-        group8_int8kv_decode_attention(
+        light_ops.group8_int8kv_decode_attention(
             o_tensor.view(calcu_shape1),
             q.view(calcu_shape1),
             infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :],
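
Both attention call sites in this commit drop a function-local `from lightllm_ppl_kernel import ...` and instead go through the `light_ops` namespace imported at module scope. As a rough sketch of how an availability flag plus kernel namespace like HAS_LIGHTLLM_KERNEL / light_ops is typically wired up (the actual lightllm.utils.light_utils may differ, and the extension module name below is hypothetical):

# Guarded import at module scope: the flag records whether the compiled
# extension is present, and light_ops is None when it is not.
try:
    import lightllm_kernel as light_ops  # hypothetical compiled-extension name
    HAS_LIGHTLLM_KERNEL = True
except ImportError:
    light_ops = None
    HAS_LIGHTLLM_KERNEL = False

Hoisting the import also means the kernel module is resolved once at load time rather than looked up on every decode call.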

lightllm/models/llama/triton_kernel/ppl_int8kv_flash_decoding.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,5 @@
 import torch
+from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
 
 
 def token_decode_attention_flash_decoding(
@@ -18,7 +19,6 @@ def token_decode_attention_flash_decoding(
     max_len_in_batch = infer_state.max_len_in_batch
     calcu_shape1 = (batch_size, q_head_num, head_dim)
 
-    from lightllm_ppl_int8kv_flashdecoding_kernel import group8_int8kv_flashdecoding_stage1
     from .flash_decoding_stage2 import flash_decode_stage2
 
     o_tensor = alloc_tensor_func(q.shape, q.dtype, q.device) if out is None else out
@@ -30,7 +30,7 @@ def token_decode_attention_flash_decoding(
         [batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 1], dtype=q.dtype, device="cuda"
     )
 
-    group8_int8kv_flashdecoding_stage1(
+    light_ops.group8_int8kv_flashdecoding_stage1(
         BLOCK_SEQ,
         mid_o,
         mid_o_logexpsum,
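
The flash-decoding path now assumes light_ops is importable whenever this kernel is selected. One possible way for a caller to make that precondition explicit is a small guard like the following; this helper is purely illustrative and is not part of the commit:

def _require_lightllm_kernel():
    # Fail fast with a clear message if the compiled extension backing
    # light_ops.group8_int8kv_flashdecoding_stage1 is not installed.
    if not HAS_LIGHTLLM_KERNEL:
        raise RuntimeError(
            "lightllm kernel extension is required for the ppl int8kv flash-decoding path"
        )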
