@@ -29,28 +29,30 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None):
 class vLLMw8a8QuantizationMethod(vLLMBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
-        self.input_scale = None

     def quantize(self, weight: torch.Tensor):
         if isinstance(weight, tuple):
-            if len(weight) == 3:
-                self.input_scale = weight[-1]
-            return weight[0].transpose(0, 1).cuda(), weight[1]
+            return (weight[0].transpose(0, 1).cuda(),) + weight[1:]
         weight = weight.float()
         scale = weight.abs().max(dim=-1)[0] / 127
         weight = weight.transpose(0, 1) / scale.reshape(1, -1)
         weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
         return weight.cuda(), scale.cuda()

     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None):
-        x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=self.input_scale, azp=None, symmetric=True)
+        input_scale = None
+        if len(weights) == 3:
+            qweight, weight_scale, input_scale = weights
+        elif len(weights) == 2:
+            qweight, weight_scale = weights
+        x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
         m = input_tensor.shape[0]
-        n = weights[0].shape[1]
+        n = qweight.shape[1]
         if out is None:
             out = g_cache_manager.alloc_tensor(
                 (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
             )
-        torch.ops._C.cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
+        torch.ops._C.cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out

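For context (not part of the commit), here is a minimal CPU-only sketch of the symmetric per-channel int8 quantization that the non-tuple branch of quantize() performs. The function name, shapes, and the dequantized-matmul check are hypothetical; the .cuda() transfer, ops.scaled_int8_quant, and the cutlass_scaled_mm kernel from the diff are not reproduced.

import torch

# Sketch of quantize()'s non-tuple path: one scale per output channel,
# weights rounded to int8 and stored transposed as (in_features, out_features).
def quantize_weight(weight: torch.Tensor):
    weight = weight.float()                          # (out_features, in_features)
    scale = weight.abs().max(dim=-1)[0] / 127        # per-output-channel scale
    qweight = weight.transpose(0, 1) / scale.reshape(1, -1)
    qweight = torch.round(qweight.clamp(min=-128, max=127)).to(torch.int8)
    return qweight, scale

w = torch.randn(8, 16)                               # hypothetical linear weight
qw, s = quantize_weight(w)

# Dequantized reference: qw * s approximates w.T, so x @ (qw * s) ~= x @ w.T,
# which is what the fused int8 GEMM in apply() computes on GPU.
x = torch.randn(4, 16)
approx = x @ (qw.float() * s.reshape(1, -1))
exact = x @ w.t()
print((approx - exact).abs().max())                  # small quantization error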