@@ -165,3 +165,40 @@ def apply_pingpong_fp8(
     from fp8_pingpong_gemm import cutlass_scaled_mm
 
     return cutlass_scaled_mm(x_q, weights[0], x_scale, weights[1], out)
+
+
+@QUANTMETHODS.register(["vllm-fp8w8a8-b128"])
+class vLLMFP8w8a8B128QuantizationMethod(vLLMBaseQuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        self.block_size = 128
+
+    def quantize(self, weight: torch.Tensor):
+        if self.is_moe:
+            return self.quantize_moe(weight)
+        qweight, weight_scale = ops.scaled_fp8_quant(
+            weight.contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
+        )
+        return qweight.transpose(0, 1), weight_scale
+
+    def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
+        qweight, weight_scale, input_scale = weights
+        m, k = input_tensor.shape
+        n = weights[0].shape[1]
+        if input_scale is None:
+            input_scale = self.cache_manager.alloc_tensor(
+                (m, k // self.block_size), torch.float32, device=input_tensor.device, is_graph_out=False
+            )
+        qinput_tensor = self.cache_manager.alloc_tensor(
+            (m, k), qweight.dtype, device=qweight.device, is_graph_out=False
+        )
+        per_token_group_quant_fp8(input_tensor, self.block_size, qinput_tensor, input_scale)
+        if out is None:
+            if use_custom_tensor_mananger:
+                out = self.cache_manager.alloc_tensor(
+                    (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
+                )
+            else:
+                out = torch.empty((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
+        torch.ops._C.cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
+        return out
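
For reference, below is a minimal pure-PyTorch sketch of the arithmetic that the per_token_group_quant_fp8 + cutlass_scaled_mm pair in this diff performs on the GPU: activations are quantized to fp8 in contiguous groups of 128 elements along k with one float32 scale per group, then multiplied against the fp8 weight with its scale applied. The helper names group_quant_fp8 and dequant_matmul are illustrative only and not part of the diff; the weight here gets a single per-tensor scale for brevity (the method above produces per-token weight scales via ops.scaled_fp8_quant), and the dequantize-then-matmul runs in float32 instead of a fused CUTLASS kernel.

import torch

# Illustrative sketch only: assumes torch >= 2.1 (torch.float8_e4m3fn) and k divisible by 128.
BLOCK = 128
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max

def group_quant_fp8(x, group=BLOCK):
    # Quantize each contiguous group of `group` elements along the last dim,
    # returning fp8 values plus one float32 scale per (row, group).
    m, k = x.shape
    xg = x.view(m, k // group, group)
    scale = xg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / FP8_MAX
    q = (xg / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q.view(m, k), scale.view(m, k // group).float()

def dequant_matmul(x_q, x_scale, w_q, w_scale, group=BLOCK):
    # Dequantize to float32 and multiply; the real path fuses this inside cutlass_scaled_mm.
    m, k = x_q.shape
    x = x_q.float().view(m, k // group, group) * x_scale.unsqueeze(-1)
    w = w_q.float() * w_scale  # single per-tensor weight scale in this sketch
    return x.view(m, k) @ w

x = torch.randn(4, 256)
w = torch.randn(256, 64)
w_scale = w.abs().max() / FP8_MAX
w_q = (w / w_scale).to(torch.float8_e4m3fn)
x_q, x_scale = group_quant_fp8(x)
print(dequant_matmul(x_q, x_scale, w_q, w_scale).shape)  # torch.Size([4, 64])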