Commit adccb9d

Static quant for vllm-w8a8 (#659)

1 parent 5b1ff80 commit adccb9d

4 files changed: +104 -12 lines changed

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py

Lines changed: 78 additions & 7 deletions
@@ -1,15 +1,27 @@
+import os
 import torch
 from .base_weight import BaseWeightTpl
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager


+def generate_scale_name(name):
+    weight_scale_name = ".".join(name.split(".")[:-1] + ["weight_scale"])
+    input_scale_name = ".".join(name.split(".")[:-1] + ["input_scale"])
+    return weight_scale_name, input_scale_name
+
+
+STATIC_QUANT = os.getenv("STATIC_QUANT", "0").upper() in ["1", "TRUE", "ON"]
+
+
 class MMWeightTpl(BaseWeightTpl):
     def __init__(self, data_type):
         super().__init__()
         self.data_type_ = data_type
         self.quant_method = None
         self.weight = None
         self.bias = None
+        self.weight_scale = None
+        self.input_scale = None

     def set_quant_method(self, quant_method):
         self.quant_method = quant_method
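For example, generate_scale_name derives the companion scale keys by replacing the last component of a weight's checkpoint key (the layer name below is hypothetical, for illustration only):

# Illustration only; "model.layers.0.q_proj.weight" is a hypothetical checkpoint key.
weight_scale_name, input_scale_name = generate_scale_name("model.layers.0.q_proj.weight")
# weight_scale_name == "model.layers.0.q_proj.weight_scale"
# input_scale_name  == "model.layers.0.q_proj.input_scale"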
@@ -31,7 +43,11 @@ def mm(self, input_tensor, out=None, use_custom_tensor_mananger=True):

     def _post_load_weights(self):
         if self.quant_method is not None:
-            self.weight = self.quant_method.quantize(self.weight.cuda(self.tp_rank_))
+            if STATIC_QUANT:
+                if all(w is not None for w in [self.weight, self.weight_scale, self.input_scale]):
+                    self.weight = self.quant_method.quantize((self.weight, self.weight_scale, self.input_scale))
+            else:
+                self.weight = self.quant_method.quantize(self.weight.to(self.data_type_).cuda(self.tp_rank_))
             return
         self.weight = self.weight.transpose(0, 1).cuda(self.tp_rank_)
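In static mode, quantize therefore receives the precomputed (weight, weight_scale, input_scale) tuple instead of a floating-point tensor; the matching isinstance(weight, tuple) branch is added in vllm_quant.py below. If any of the three pieces has not arrived yet, the weight is left untouched until a later load_hf_weights call completes the set.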

@@ -43,6 +59,7 @@ def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):
         self.end = split_n_embed * (self.tp_rank_ + 1)
         self.weight_name = weight_name
         self.bias_name = bias_name
+        self.weight_scale_name, self.input_scale_name = generate_scale_name(weight_name)

     def verify_load(self):
         load_ok = True
@@ -60,13 +77,24 @@ def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):

     def load_hf_weights(self, weights):
         weight = None
+        weight_scale = None
+        input_scale = None
         if self.weight_name in weights:
-            weight = weights[self.weight_name].to(self.data_type_)
+            weight = weights[self.weight_name]
             self.weight = weight[self.start : self.end]
         if self.bias_name in weights:
             bias = weights[self.bias_name].to(self.data_type_)[self.start : self.end]
             self.bias = bias.cuda(self.tp_rank_)
-        if weight is None:
+
+        if STATIC_QUANT and self.weight_scale_name in weights:
+            weight_scale = weights[self.weight_scale_name].to(torch.float)[self.start : self.end]
+            self.weight_scale = weight_scale.cuda()
+
+        if STATIC_QUANT and self.input_scale_name in weights:
+            input_scale = weights[self.input_scale_name].to(torch.float)
+            self.input_scale = input_scale.cuda()
+
+        if weight is None and weight_scale is None and input_scale is None:
             return
         self._post_load_weights()
         return
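Note that the eager .to(self.data_type_) cast is dropped from load_hf_weights here: a statically quantized checkpoint may store the weight as int8, so casting to the runtime dtype is deferred to the dynamic branch of _post_load_weights.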
@@ -85,13 +113,24 @@ def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):

     def load_hf_weights(self, weights):
         weight = None
+        weight_scale = None
+        input_scale = None
         if self.weight_name in weights:
-            weight = weights[self.weight_name].to(self.data_type_)
+            weight = weights[self.weight_name]
             self.weight = weight[:, self.start : self.end]
         if self.bias_name in weights:
             bias = weights[self.bias_name]
             self.bias = (bias / self.world_size_).to(self.data_type_).cuda(self.tp_rank_)
-        if weight is None:
+
+        if STATIC_QUANT and self.weight_scale_name in weights:
+            weight_scale = weights[self.weight_scale_name].to(torch.float)
+            self.weight_scale = weight_scale.cuda()
+
+        if STATIC_QUANT and self.input_scale_name in weights:
+            input_scale = weights[self.input_scale_name].to(torch.float)
+            self.input_scale = input_scale.cuda()
+
+        if weight is None and weight_scale is None and input_scale is None:
             return
         self._post_load_weights()
         return
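This row-parallel variant slices the weight along the input dimension, so the per-output-channel weight_scale is loaded whole rather than sliced as in the column-parallel case above.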
@@ -109,8 +148,17 @@ def __init__(self, weight_names, data_type, split_n_embeds, bias_names=[]):
         self.ends = [i * (self.tp_rank_ + 1) for i in self.split_n_embeds]
         self.weight_names = weight_names
         self.bias_names = bias_names
+        self.weight_scale_names = []
+        self.input_scale_names = []
+        for weight_name in weight_names:
+            weight_scale_name, input_scale_name = generate_scale_name(weight_name)
+            self.weight_scale_names.append(weight_scale_name)
+            self.input_scale_names.append(input_scale_name)
+
         self.weights = [None] * len(self.weight_names)
         self.biases = [None] * len(self.bias_names)
+        self.input_scales = [None] * len(self.weight_names)
+        self.weight_scales = [None] * len(self.weight_names)
         self.has_bias = all(b is not None for b in self.bias_names) and len(bias_names) > 0

     def verify_load(self):
@@ -131,6 +179,16 @@ def _fuse(self):
         if self.weight is None and all(w is not None for w in self.weights):
             self.weight = torch.cat(self.weights, dim=0)
             self._post_load_weights()
+
+        if self.weight_scale is None and all(w is not None for w in self.weight_scales):
+            self.weight_scale = torch.cat(self.weight_scales, dim=0).cuda()
+            self._post_load_weights()
+
+        if self.input_scale is None and all(w is not None for w in self.input_scales):
+            input_scales = torch.stack(self.input_scales, dim=0)
+            self.input_scale = torch.max(input_scales).cuda()
+            self._post_load_weights()
+
         if self.has_bias:
             if self.bias is None and all(b is not None for b in self.biases):
                 self.bias = torch.cat(self.biases, dim=0).cuda(self.tp_rank_)
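Weight scales are per output channel, so for fused projections they concatenate along dim 0 exactly like the weights. The fused layers share one input, however, so only a single activation scale can apply; taking the max of the per-projection input scales is the conservative choice, avoiding overflow for every branch at the cost of some precision. A minimal sketch with made-up values:

# Fusing static scales for a hypothetical two-way merge; values are made up.
import torch

weight_scales = [torch.tensor([0.010, 0.020]), torch.tensor([0.030, 0.040])]
input_scales = [torch.tensor(0.5), torch.tensor(0.8)]

weight_scale = torch.cat(weight_scales, dim=0)      # tensor([0.0100, 0.0200, 0.0300, 0.0400])
input_scale = torch.max(torch.stack(input_scales))  # tensor(0.8000), shared by all fused outputs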
@@ -140,11 +198,18 @@ def load_hf_weights(self, weights):
         weight = None
         for i in range(len(self.weight_names)):
             if self.weight_names[i] in weights:
-                weight = weights[self.weight_names[i]].to(self.data_type_)
+                weight = weights[self.weight_names[i]]
                 self.weights[i] = weight[self.starts[i] : self.ends[i]]
             if self.has_bias and self.bias_names[i] in weights:
                 bias = weights[self.bias_names[i]].to(self.data_type_)
                 self.biases[i] = bias[self.starts[i] : self.ends[i]]
+            if STATIC_QUANT and self.weight_scale_names[i] in weights:
+                weight_scale = weights[self.weight_scale_names[i]][self.starts[i] : self.ends[i]]
+                self.weight_scales[i] = weight_scale.to(torch.float)
+            if STATIC_QUANT and self.input_scale_names[i] in weights:
+                input_scale = weights[self.input_scale_names[i]].to(torch.float)
+                self.input_scales[i] = input_scale
+
         self._fuse()
         return

@@ -164,11 +229,17 @@ def load_hf_weights(self, weights):
         weight = None
         for i in range(len(self.weight_names)):
             if self.weight_names[i] in weights:
-                weight = weights[self.weight_names[i]].to(self.data_type_)
+                weight = weights[self.weight_names[i]]
                 self.weights[i] = weight[:, self.starts[i] : self.ends[i]]
             if self.has_bias and self.bias_names[i] in weights:
                 bias = weights[self.bias_names[i]].to(self.data_type_)
                 self.biases[i] = bias[:, self.starts[i] : self.ends[i]]
+            if STATIC_QUANT and self.weight_scale_names[i] in weights:
+                weight_scale = weights[self.weight_scale_names[i]]
+                self.weight_scales[i] = weight_scale.to(torch.float)
+            if STATIC_QUANT and self.input_scale_names[i] in weights:
+                input_scale = weights[self.input_scale_names[i]].to(torch.float)
+                self.input_scales[i] = input_scale
         self._fuse()
         return

lightllm/common/quantization/vllm_quant.py

Lines changed: 13 additions & 5 deletions
@@ -31,23 +31,31 @@ def __init__(self):
         super().__init__()

     def quantize(self, weight: torch.Tensor):
-        if hasattr(weight, "scale"):
-            return weight.data.transpose(0, 1).cuda(), weight.scale.cuda()
+        if isinstance(weight, tuple):
+            return (weight[0].transpose(0, 1).cuda(),) + weight[1:]
         weight = weight.float()
         scale = weight.abs().max(dim=-1)[0] / 127
         weight = weight.transpose(0, 1) / scale.reshape(1, -1)
         weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
         return weight.cuda(), scale.cuda()

     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None):
-        x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=None, azp=None, symmetric=True)
+        input_scale = None
+        if len(weights) == 3:
+            qweight, weight_scale, input_scale = weights
+        elif len(weights) == 2:
+            qweight, weight_scale = weights
+        else:
+            raise ValueError("vllm-quant Weights must be a tuple of length 2 or 3.")
+
+        x_q, x_scale, x_zp = ops.scaled_int8_quant(input_tensor, scale=input_scale, azp=None, symmetric=True)
         m = input_tensor.shape[0]
-        n = weights[0].shape[1]
+        n = qweight.shape[1]
         if out is None:
             out = g_cache_manager.alloc_tensor(
                 (m, n), input_tensor.dtype, device=input_tensor.device, is_graph_out=False
             )
-        torch.ops._C.cutlass_scaled_mm(out, x_q, weights[0], x_scale, weights[1], bias)
+        torch.ops._C.cutlass_scaled_mm(out, x_q, qweight, x_scale, weight_scale, bias)
         return out
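The dynamic branch of quantize computes a symmetric per-output-channel scale, max|w| / 127 along each row, then scales, clamps, and rounds into int8, returning the weight transposed for the GEMM. A standalone sketch of the same arithmetic on a made-up tensor:

# Symmetric per-channel int8 quantization, mirroring the dynamic branch above;
# the tensor values are made up for illustration.
import torch

w = torch.tensor([[0.50, -1.27], [2.54, 0.10]])  # (out_features, in_features)
scale = w.abs().max(dim=-1)[0] / 127             # tensor([0.0100, 0.0200]), one scale per output channel
w_q = torch.round((w.transpose(0, 1) / scale.reshape(1, -1)).clamp(min=-128, max=127)).to(torch.int8)
# dequantize: w ≈ (w_q.float() * scale.reshape(1, -1)).transpose(0, 1)

With static scales loaded, apply instead passes the stored input_scale into ops.scaled_int8_quant, so activations are quantized with the calibration-time scale rather than one measured per call.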

lightllm/server/api_cli.py

Lines changed: 5 additions & 0 deletions
@@ -244,4 +244,9 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="""Path of quantization config. It can be used for mixed quantization.
             Examples can be found in lightllm/common/quantization/configs.""",
     )
+    parser.add_argument(
+        "--static_quant",
+        action="store_true",
+        help="whether to load static quantized weights. Currently, only vllm-w8a8 is supported.",
+    )
     return parser
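The new flag only selects the loading path; it must be combined with --quant_type vllm-w8a8 (enforced in api_start.py below), and it takes effect by exporting STATIC_QUANT=1, which mm_weight.py reads at import time.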

lightllm/server/api_start.py

Lines changed: 8 additions & 0 deletions

@@ -17,6 +17,9 @@

 logger = init_logger(__name__)

+def set_env(args):
+    if args.static_quant:
+        os.environ["STATIC_QUANT"] = "1"

 def normal_or_p_d_start(g_objs):
     from .api_server import G_Objs
@@ -45,11 +48,16 @@ def normal_or_p_d_start(g_objs):

     logger.info(f"use tgi api: {args.use_tgi_api}")

+    set_env(args)
+
     assert not (args.beam_mode and args.use_dynamic_prompt_cache), "Beam mode incompatible with dynamic prompt cache"
     assert (
         args.mem_fraction > 0 and args.mem_fraction < 1
     ), f"Invalid mem_fraction {args.mem_fraction}, The expected value is between 0 and 1."

+    if args.static_quant:
+        assert args.quant_type == "vllm-w8a8", "Only static parameter loading for vllm-w8a8 is supported."
+
     # splitfuse_mode and cuda_graph cannot be enabled at the same time
     if args.splitfuse_mode:
         assert args.disable_cudagraph
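End to end, the flag travels through an environment variable rather than through the model config. A minimal sketch of the handshake, using the names from the diff above:

# How --static_quant reaches the weight loader; illustration only.
import os

os.environ["STATIC_QUANT"] = "1"  # what set_env(args) does when --static_quant is passed

# mm_weight.py evaluates this once, at import time:
STATIC_QUANT = os.getenv("STATIC_QUANT", "0").upper() in ["1", "TRUE", "ON"]
assert STATIC_QUANT

Because STATIC_QUANT is a module-level constant, flipping the variable after the model modules are imported has no effect, which is presumably why set_env(args) runs at the top of normal_or_p_d_start, before any model loading.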