Support AWQ for vLLM (#292)

yunfeng-scale · web-flow · commit dfd75c9506b0 · 2023-09-26T09:49:23.000-07:00
diff --git a/model-engine/model_engine_server/domain/entities/llm_entity.py b/model-engine/model_engine_server/domain/entities/llm_entity.py
@@ -16,6 +16,7 @@ class LLMInferenceFramework(str, Enum):
 
 class Quantization(str, Enum):
     BITSANDBYTES = "bitsandbytes"
+    AWQ = "awq"
 
 
 @dataclass
diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py
@@ -227,6 +227,7 @@ async def create_model_bundle(
                     framework_image_tag,
                     endpoint_name,
                     num_shards,
+                    quantize,
                     checkpoint_path,
                 )
             elif framework == LLMInferenceFramework.LIGHTLLM:
@@ -453,6 +454,7 @@ async def create_vllm_bundle(
         framework_image_tag: str,
         endpoint_unique_name: str,
         num_shards: int,
+        quantize: Optional[Quantization],
         checkpoint_path: Optional[str],
     ):
         command = []
@@ -482,6 +484,12 @@ async def create_vllm_bundle(
             f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
         )
 
+        if quantize:
+            if quantize == Quantization.AWQ:
+                subcommands[-1] = subcommands[-1] + f" --quantization {quantize}"
+            else:
+                raise InvalidRequestException(f"Quantization {quantize} is not supported by vLLM.")
+
         command = [
             "/bin/bash",
             "-c",
diff --git a/model-engine/model_engine_server/inference/vllm/requirements.txt b/model-engine/model_engine_server/inference/vllm/requirements.txt
@@ -1,3 +1,3 @@
 ray==2.6.3
-vllm==0.1.7
+git+https://github.com/vllm-project/vllm.git@7d7e3b78a3c265ab3c57eeff43af56f509907998#egg=vllm
 pydantic==1.10.12