Skip to content

Commit dfd75c9

Browse files
Support AWQ for vLLM (#292)
1 parent 10385eb commit dfd75c9

File tree

3 files changed

+10 −1 lines changed

3 files changed

+10 −1 lines changed

model-engine/model_engine_server/domain/entities/llm_entity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class LLMInferenceFramework(str, Enum):
1616

1717
class Quantization(str, Enum):
1818
BITSANDBYTES = "bitsandbytes"
19+
AWQ = "awq"
1920

2021

2122
@dataclass

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ async def create_model_bundle(
227227
framework_image_tag,
228228
endpoint_name,
229229
num_shards,
230+
quantize,
230231
checkpoint_path,
231232
)
232233
elif framework == LLMInferenceFramework.LIGHTLLM:
@@ -453,6 +454,7 @@ async def create_vllm_bundle(
453454
framework_image_tag: str,
454455
endpoint_unique_name: str,
455456
num_shards: int,
457+
quantize: Optional[Quantization],
456458
checkpoint_path: Optional[str],
457459
):
458460
command = []
@@ -482,6 +484,12 @@ async def create_vllm_bundle(
482484
f"python -m vllm_server --model {final_weights_folder} --tensor-parallel-size {num_shards} --port 5005 --max-num-batched-tokens {max_num_batched_tokens}"
483485
)
484486

487+
if quantize:
488+
if quantize == Quantization.AWQ:
489+
subcommands[-1] = subcommands[-1] + f" --quantization {quantize}"
490+
else:
491+
raise InvalidRequestException(f"Quantization {quantize} is not supported by vLLM.")
492+
485493
command = [
486494
"/bin/bash",
487495
"-c",
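(Third changed file — its path was not captured in this extract; the hunk below is a pip requirements file.)

Lines changed: 1 addition & 1 deletion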
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
ray==2.6.3
2-
vllm==0.1.7
2+
git+https://github.com/vllm-project/vllm.git@7d7e3b78a3c265ab3c57eeff43af56f509907998#egg=vllm
33
pydantic==1.10.12

0 commit comments

Comments
 (0)