
Commit 3d9c1bc

add cli/env
1 parent 6ea84e8 commit 3d9c1bc

4 files changed: +26 lines, -11 lines

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py

Lines changed: 10 additions & 10 deletions
@@ -10,7 +10,7 @@ def generate_scale_name(name):
     return weight_scale_name, input_scale_name


-QUANTED_WEIGHT = os.getenv("QUANTED_WEIGHT", "0").upper() in ["1", "TRUE", "ON"]
+STATIC_QUANT = os.getenv("STATIC_QUANT", "0").upper() in ["1", "TRUE", "ON"]


 class MMWeightTpl(BaseWeightTpl):
@@ -43,7 +43,7 @@ def mm(self, input_tensor, out=None, use_custom_tensor_mananger=True):

     def _post_load_weights(self):
         if self.quant_method is not None:
-            if QUANTED_WEIGHT:
+            if STATIC_QUANT:
                 if all(w is not None for w in [self.weight, self.weight_scale, self.input_scale]):
                     self.weight = self.quant_method.quantize((self.weight, self.weight_scale, self.input_scale))
             else:
@@ -86,11 +86,11 @@ def load_hf_weights(self, weights):
             bias = weights[self.bias_name].to(self.data_type_)[self.start : self.end]
             self.bias = bias.cuda(self.tp_rank_)

-        if QUANTED_WEIGHT and self.weight_scale_name in weights:
+        if STATIC_QUANT and self.weight_scale_name in weights:
             weight_scale = weights[self.weight_scale_name].to(torch.float)[self.start : self.end]
             self.weight_scale = weight_scale.cuda()

-        if QUANTED_WEIGHT and self.input_scale_name in weights:
+        if STATIC_QUANT and self.input_scale_name in weights:
             input_scale = weights[self.input_scale_name].to(torch.float)
             self.input_scale = input_scale.cuda()

@@ -122,11 +122,11 @@ def load_hf_weights(self, weights):
             bias = weights[self.bias_name]
             self.bias = (bias / self.world_size_).to(self.data_type_).cuda(self.tp_rank_)

-        if QUANTED_WEIGHT and self.weight_scale_name in weights:
+        if STATIC_QUANT and self.weight_scale_name in weights:
             weight_scale = weights[self.weight_scale_name].to(torch.float)
             self.weight_scale = weight_scale.cuda()

-        if QUANTED_WEIGHT and self.input_scale_name in weights:
+        if STATIC_QUANT and self.input_scale_name in weights:
             input_scale = weights[self.input_scale_name].to(torch.float)
             self.input_scale = input_scale.cuda()

@@ -203,10 +203,10 @@ def load_hf_weights(self, weights):
             if self.has_bias and self.bias_names[i] in weights:
                 bias = weights[self.bias_names[i]].to(self.data_type_)
                 self.biases[i] = bias[self.starts[i] : self.ends[i]]
-            if QUANTED_WEIGHT and self.weight_scale_names[i] in weights:
+            if STATIC_QUANT and self.weight_scale_names[i] in weights:
                 weight_scale = weights[self.weight_scale_names[i]][self.starts[i] : self.ends[i]]
                 self.weight_scales[i] = weight_scale.to(torch.float)
-            if QUANTED_WEIGHT and self.input_scale_names[i] in weights:
+            if STATIC_QUANT and self.input_scale_names[i] in weights:
                 input_scale = weights[self.input_scale_names[i]].to(torch.float)
                 self.input_scales[i] = input_scale

@@ -234,10 +234,10 @@ def load_hf_weights(self, weights):
             if self.has_bias and self.bias_names[i] in weights:
                 bias = weights[self.bias_names[i]].to(self.data_type_)
                 self.biases[i] = bias[:, self.starts[i] : self.ends[i]]
-            if QUANTED_WEIGHT and self.weight_scale_names[i] in weights:
+            if STATIC_QUANT and self.weight_scale_names[i] in weights:
                 weight_scale = weights[self.weight_scale_names[i]]
                 self.weight_scales[i] = weight_scale.to(torch.float)
-            if QUANTED_WEIGHT and self.input_scale_names[i] in weights:
+            if STATIC_QUANT and self.input_scale_names[i] in weights:
                 input_scale = weights[self.input_scale_names[i]].to(torch.float)
                 self.input_scales[i] = input_scale
         self._fuse()
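In short, this file renames the QUANTED_WEIGHT toggle to STATIC_QUANT: when the environment variable is set, the loader expects the checkpoint to already carry weight_scale / input_scale tensors and repacks them, instead of quantizing the weights at load time. A minimal sketch of that branch follows; it is not the real class method, and the dynamic else-branch body is a guess since it is unchanged (and not shown) in this diff:

```python
import os

# Same truthy-string parsing as the new module-level flag in mm_weight.py.
STATIC_QUANT = os.getenv("STATIC_QUANT", "0").upper() in ["1", "TRUE", "ON"]


def post_load(weight, weight_scale, input_scale, quant_method):
    """Illustrative stand-in for _post_load_weights (not the real method)."""
    if STATIC_QUANT:
        # Static path: the checkpoint already ships per-tensor scales, so the
        # quant method just repacks (weight, weight_scale, input_scale).
        if all(w is not None for w in (weight, weight_scale, input_scale)):
            return quant_method.quantize((weight, weight_scale, input_scale))
        return weight  # scales not loaded yet; keep the raw weight for now
    # Dynamic path (sketch of the pre-existing else branch): quantize the
    # floating-point weight at load time with no stored scales.
    return quant_method.quantize(weight)


print("STATIC_QUANT:", STATIC_QUANT)
```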

lightllm/server/api_cli.py

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,12 @@
+import os
 import argparse


+def push_env(args):
+    if args.static_quant:
+        os.environ["STATIC_QUANT"] = "1"
+
+
 def make_argument_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser()

@@ -244,4 +250,9 @@ def make_argument_parser() -> argparse.ArgumentParser:
         help="""Path of quantization config. It can be used for mixed quantization.
         Examples can be found in lightllm/common/quantization/configs.""",
     )
+    parser.add_argument(
+        "--static_quant",
+        action="store_true",
+        help="whether to load static quantized weights. Currently, only vllm-w8a8 is supported.",
+    )
     return parser
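mm_weight.py evaluates STATIC_QUANT once at import time, so the new CLI flag has to be exported as an environment variable before any model code is imported. A small, self-contained sketch of the parse_args → push_env handoff (the hard-coded argv is only there so the snippet runs standalone):

```python
import argparse
import os

# Minimal reproduction of the new wiring: parse_args() -> push_env(args),
# so that STATIC_QUANT is visible to mm_weight.py when it is imported.
parser = argparse.ArgumentParser()
parser.add_argument("--static_quant", action="store_true")
args = parser.parse_args(["--static_quant"])  # simulate passing the flag

if args.static_quant:                 # exactly what push_env(args) does
    os.environ["STATIC_QUANT"] = "1"

print(os.environ.get("STATIC_QUANT", "0"))  # -> "1"
```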

lightllm/server/api_server.py

Lines changed: 2 additions & 1 deletion
@@ -37,7 +37,7 @@
 from fastapi import BackgroundTasks, FastAPI, Request, WebSocket, WebSocketDisconnect
 from fastapi.responses import Response, StreamingResponse, JSONResponse
 import uvicorn
-from .api_cli import make_argument_parser
+from .api_cli import make_argument_parser, push_env
 from .sampling_params import SamplingParams
 from .multimodal_params import MultimodalParams
 from .httpserver.manager import HttpServerManager
@@ -390,6 +390,7 @@ async def startup_event():
     torch.multiprocessing.set_start_method("spawn"),  # this code will not be ok for settings to fork to subprocess
     parser = make_argument_parser()
     args = parser.parse_args()
+    push_env(args)
     g_objs.args = args
     from .api_start import normal_or_p_d_start, pd_master_start

lightllm/server/api_start.py

Lines changed: 3 additions & 0 deletions
@@ -50,6 +50,9 @@ def normal_or_p_d_start(g_objs):
         args.mem_fraction > 0 and args.mem_fraction < 1
     ), f"Invalid mem_fraction {args.mem_fraction}, The expected value is between 0 and 1."

+    if args.static_quant:
+        assert args.quant_type == "vllm-w8a8", "Only static parameter loading for vllm-w8a8 is supported."
+
     # splitfuse_mode and cuda_graph cannot be enabled at the same time
     if args.splitfuse_mode:
         assert args.disable_cudagraph
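The new guard only allows --static_quant together with the vllm-w8a8 backend. A minimal sketch of the check in isolation (the helper name and the alternative quant-type string are made up for illustration):

```python
def check_static_quant_args(static_quant: bool, quant_type: str) -> None:
    # Mirrors the new assertion in normal_or_p_d_start.
    if static_quant:
        assert quant_type == "vllm-w8a8", "Only static parameter loading for vllm-w8a8 is supported."


check_static_quant_args(True, "vllm-w8a8")         # passes
check_static_quant_args(False, "some-other-type")  # passes: flag not set
# check_static_quant_args(True, "some-other-type") would raise AssertionError
```

In practice this means a static-quant launch must pass --quant_type vllm-w8a8 alongside --static_quant.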
