
Commit fc65129

Upgrade vllm to 0.11.0 and set vllm_async_service.py as the default handler (#2918)

Authored by: ksuma2109 (Suma Kasa) and xyang16 (Xin Yang)
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
Co-authored-by: Xin Yang <xyangx@amazon.com>

1 parent: e3ed832

File tree

12 files changed: +73 -67 lines changed

engines/python/setup/djl_python/async_utils.py

Lines changed: 2 additions & 2 deletions
@@ -127,8 +127,8 @@ def _extract_lora_adapter(raw_request, decoded_payload):
         adapter_name = raw_request.get_property(
             SAGEMAKER_ADAPTER_IDENTIFIER_HEADER)
         logging.debug(f"Found adapter in headers: {adapter_name}")
-    elif "adapter" in decoded_payload:
-        adapter_name = decoded_payload.pop("adapter")
+    elif "adapters" in decoded_payload:
+        adapter_name = decoded_payload.get("adapters")
         logging.debug(f"Found adapter in payload: {adapter_name}")
 
     return adapter_name
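
The payload key changes from "adapter" to "adapters", and the value is now read with .get() instead of .pop(), so the key stays in the payload after extraction. A minimal runnable sketch of just this branch (the payload contents and adapter name are hypothetical):

# Sketch of the new payload branch; values are hypothetical.
decoded_payload = {"inputs": "What is Deep Learning?", "adapters": "my-adapter"}

adapter_name = None
if "adapters" in decoded_payload:
    # .get() instead of the old .pop("adapter"): the key is not removed
    adapter_name = decoded_payload.get("adapters")

assert adapter_name == "my-adapter"
assert "adapters" in decoded_payload  # still present, unlike with .pop()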

engines/python/setup/djl_python/lmi_vllm/request_response_utils.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
     ErrorResponse,
     CompletionLogProbs,
 )
-from vllm.sequence import Logprob
+from vllm.logprobs import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from djl_python.outputs import Output

engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py

Lines changed: 8 additions & 10 deletions
@@ -23,7 +23,8 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels, BaseModelPath
-from vllm.utils import kill_process_tree, AtomicCounter
+from vllm.utils.counter import AtomicCounter
+from vllm.utils.system_utils import kill_process_tree
 
 from djl_python.properties_manager.hf_properties import HuggingFaceProperties
 from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties

@@ -93,7 +94,6 @@ async def initialize(self, properties: dict):
         self.vllm_engine = AsyncLLMEngine.from_engine_args(
             self.vllm_engine_args)
         self.tokenizer = await self.vllm_engine.get_tokenizer()
-        model_config = await self.vllm_engine.get_model_config()
 
         model_names = self.vllm_engine_args.served_model_name or "lmi"
         if not isinstance(model_names, list):

@@ -108,19 +108,16 @@ async def initialize(self, properties: dict):
         self.model_name = model_names[0]
         self.model_registry = OpenAIServingModels(
             self.vllm_engine,
-            model_config,
             base_model_paths,
         )
         self.completion_service = OpenAIServingCompletion(
             self.vllm_engine,
-            model_config,
             self.model_registry,
             request_logger=None,
         )
 
         self.chat_completion_service = OpenAIServingChat(
             self.vllm_engine,
-            model_config,
             self.model_registry,
             "assistant",
             request_logger=None,

@@ -249,13 +246,14 @@ async def add_request_with_lora(*args, **kwargs):
                 return await original_add_request(*args, **kwargs)
 
             self.vllm_engine.add_request = add_request_with_lora
-
-        try:
+            try:
+                response = await processed_request.inference_invoker(
+                    processed_request.vllm_request)
+            finally:
+                self.vllm_engine.add_request = original_add_request
+        else:
             response = await processed_request.inference_invoker(
                 processed_request.vllm_request)
-        finally:
-            if processed_request.lora_request:
-                self.vllm_engine.add_request = original_add_request
 
         if isinstance(response, types.AsyncGeneratorType):
             # Apply custom formatter to streaming response
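
The last hunk moves the inference call inside the LoRA branch, so the monkey-patched add_request is restored in a finally block even when inference raises, and the non-LoRA path skips the try/finally entirely. A self-contained sketch of that patch-and-restore pattern, with a dummy engine standing in for AsyncLLMEngine (the names here are illustrative, not the vLLM API):

import asyncio

class DummyEngine:
    """Stands in for AsyncLLMEngine; only the patch-and-restore shape matters."""

    async def add_request(self, prompt, lora_request=None):
        return f"prompt={prompt!r} lora={lora_request!r}"

async def infer(engine, prompt, lora_request=None):
    if lora_request is not None:
        original_add_request = engine.add_request

        async def add_request_with_lora(*args, **kwargs):
            kwargs.setdefault("lora_request", lora_request)
            return await original_add_request(*args, **kwargs)

        engine.add_request = add_request_with_lora  # patch for this request only
        try:
            return await engine.add_request(prompt)
        finally:
            engine.add_request = original_add_request  # restored even on error
    return await engine.add_request(prompt)

print(asyncio.run(infer(DummyEngine(), "hello", lora_request="my-adapter")))
# prompt='hello' lora='my-adapter'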

engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py

Lines changed: 18 additions & 17 deletions
@@ -15,8 +15,7 @@
 from typing import Optional, Any, Dict, Tuple, Literal, Union
 from pydantic import field_validator, model_validator, ConfigDict, Field
 from vllm import EngineArgs, AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
-from vllm.utils import StoreBoolean
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 from djl_python.properties_manager.properties import Properties
 

@@ -31,22 +30,24 @@
 }
 
 
-def construct_vllm_args_list(vllm_engine_args: dict,
-                             parser: FlexibleArgumentParser):
-    # Modified from https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/utils.py#L1258
+def construct_vllm_args_list(vllm_engine_args: dict):
+    # Modified from https://github.com/vllm-project/vllm/blob/94666612a938380cb643c1555ef9aa68b7ab1e53/vllm/utils/argparse_utils.py#L441
     args_list = []
-    store_boolean_arguments = {
-        action.dest
-        for action in parser._actions if isinstance(action, StoreBoolean)
-    }
-    for engine_arg, engine_arg_value in vllm_engine_args.items():
-        if str(engine_arg_value).lower() in {
-                'true', 'false'
-        } and engine_arg not in store_boolean_arguments:
-            if str(engine_arg_value).lower() == 'true':
-                args_list.append(f"--{engine_arg}")
+    for key, value in vllm_engine_args.items():
+        if str(value).lower() in {'true', 'false'}:
+            if str(value).lower() == 'true':
+                args_list.append("--" + key)
+        elif isinstance(value, bool):
+            if value:
+                args_list.append("--" + key)
+        elif isinstance(value, list):
+            if value:
+                args_list.append("--" + key)
+                for item in value:
+                    args_list.append(str(item))
         else:
-            args_list.append(f"--{engine_arg}={engine_arg_value}")
+            args_list.append("--" + key)
+            args_list.append(str(value))
     return args_list
 
 

@@ -228,7 +229,7 @@ def get_engine_args(self,
         )
         arg_cls = AsyncEngineArgs if async_engine else EngineArgs
         parser = arg_cls.add_cli_args(FlexibleArgumentParser())
-        args_list = construct_vllm_args_list(vllm_engine_arg_dict, parser)
+        args_list = construct_vllm_args_list(vllm_engine_arg_dict)
         args = parser.parse_args(args=args_list)
         engine_args = arg_cls.from_cli_args(args)
         # we have to do this separately because vllm converts it into a string
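
With StoreBoolean gone from vllm.utils, the rewritten helper no longer needs the parser: it classifies values itself and emits space-separated tokens ("--flag value") rather than "--flag=value". A runnable demo of the function as it appears in the hunk above (the body is copied from the diff; the engine-arg names in the dict are illustrative, not a claim about which vLLM flags exist):

def construct_vllm_args_list(vllm_engine_args: dict):
    args_list = []
    for key, value in vllm_engine_args.items():
        if str(value).lower() in {'true', 'false'}:
            if str(value).lower() == 'true':
                args_list.append("--" + key)
        elif isinstance(value, bool):
            if value:
                args_list.append("--" + key)
        elif isinstance(value, list):
            if value:
                args_list.append("--" + key)
                for item in value:
                    args_list.append(str(item))
        else:
            args_list.append("--" + key)
            args_list.append(str(value))
    return args_list

print(construct_vllm_args_list({
    "enable_lora": "true",         # string boolean -> bare flag
    "max_model_len": 4096,         # scalar -> flag followed by its value
    "served_model_name": ["lmi"],  # list -> flag followed by each item
}))
# ['--enable_lora', '--max_model_len', '4096', '--served_model_name', 'lmi']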

engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py

Lines changed: 5 additions & 5 deletions
@@ -14,7 +14,8 @@
 
 from vllm import LLMEngine, SamplingParams
 from vllm.sampling_params import RequestOutputKind
-from vllm.utils import random_uuid, AtomicCounter
+from vllm.utils import random_uuid
+from vllm.utils.counter import AtomicCounter
 
 from djl_python.request import Request
 from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params

@@ -58,8 +59,7 @@ def __init__(self, model_id_or_path: str, properties: dict,
             try:
                 self.tool_parser = ToolParserManager.get_tool_parser(
                     self.vllm_configs.tool_call_parser)
-                self.tool_parser = self.tool_parser(
-                    self.engine.tokenizer.tokenizer)
+                self.tool_parser = self.tool_parser(self.get_tokenizer())
             except Exception as e:
                 raise TypeError("Error in tool parser creation.") from e
         if self.vllm_configs.enable_reasoning:

@@ -68,12 +68,12 @@ def __init__(self, model_id_or_path: str, properties: dict,
                 self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
                     self.vllm_configs.reasoning_parser)
                 self.reasoning_parser = self.reasoning_parser(
-                    self.engine.tokenizer.tokenizer)
+                    self.get_tokenizer())
             except Exception as e:
                 raise TypeError("Error in reasoning parser creation.") from e
 
     def get_tokenizer(self):
-        return self.engine.tokenizer.tokenizer
+        return self.engine.get_tokenizer()
 
     def get_model_config(self):
         return self.engine.model_config

engines/python/src/main/java/ai/djl/python/engine/PyModel.java

Lines changed: 6 additions & 0 deletions
@@ -187,6 +187,12 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
             recommendedEntryPoint = "djl_python.transformers_neuronx";
         } else if ("trtllm".equals(features)) {
             recommendedEntryPoint = "djl_python.tensorrt_llm";
+        } else if ("vllm".equals(features)) {
+            if (pyEnv.isAsyncMode()) {
+                recommendedEntryPoint = "djl_python.lmi_vllm.vllm_async_service";
+            } else {
+                recommendedEntryPoint = "djl_python.huggingface";
+            }
         } else if (pyEnv.getInitParameters().containsKey("model_id")
                 || Files.exists(modelPath.resolve("config.json"))) {
             recommendedEntryPoint = "djl_python.huggingface";

serving/docker/lmi-container-requirements.txt

Lines changed: 6 additions & 5 deletions
@@ -1,8 +1,9 @@
-torch==2.8.0
+torch==2.9.0
+autoawq
 torchvision
 peft==0.15.1
 protobuf==4.25.1
-transformers==4.55.2
+transformers==4.56.0
 hf-transfer
 zstandard
 datasets==3.0.1

@@ -25,12 +26,12 @@ sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq
 tokenizers
-pydantic==2.11.7
+pydantic>=2.12.0
 optimum==1.23.2
 uvloop
 ninja
 peft
 llmcompressor
-vllm==0.10.2
+https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 xgrammar
-flashinfer-python==0.2.5
+flashinfer-python==0.4.1

serving/docker/lmi.Dockerfile

Lines changed: 2 additions & 1 deletion
@@ -89,7 +89,8 @@ RUN scripts/patch_oss_dlc.sh python \
     && apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
 COPY lmi-container-requirements.txt ./requirements.txt
-RUN pip3 install torch==2.8.0 torchvision \
+RUN pip3 install --upgrade pip setuptools
+RUN pip3 install torch==2.9.0 torchvision \
     && pip3 install -r requirements.txt \
    && pip3 install ${djl_converter_wheel} --no-deps

tests/integration/llm/prepare.py

Lines changed: 0 additions & 3 deletions
@@ -1701,9 +1701,6 @@ def build_vllm_async_model(model):
     )
     options = vllm_model_list[model]
     options["engine"] = "Python"
-    options["option.rolling_batch"] = "disable"
-    options["option.async_mode"] = "true"
-    options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"
 
     adapter_ids = options.pop("adapter_ids", [])
     adapter_names = options.pop("adapter_names", [])
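
The three removed options were the tests' manual opt-in for the async vLLM handler; per the commit title and the PyModel.java change above, that handler is now the default, so the test setup can drop them. A before/after sketch of the options dict (keys taken from the removed lines; the surrounding test model entries are omitted):

# Before this commit: tests forced the async vLLM handler explicitly.
options_before = {
    "engine": "Python",
    "option.rolling_batch": "disable",
    "option.async_mode": "true",
    "option.entryPoint": "djl_python.lmi_vllm.vllm_async_service",
}

# After: only the engine is set; the entry point now comes from the
# default selection added in PyModel.java.
options_after = {"engine": "Python"}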

tests/integration/tests.py

Lines changed: 15 additions & 15 deletions
@@ -555,15 +555,15 @@ def test_gemma_2b(self):
 
     def test_llama2_7b_chat(self):
         with Runner('lmi', 'llama2-7b-chat') as r:
-            prepare.build_vllm_model("llama2-7b-chat")
+            prepare.build_vllm_async_model("llama2-7b-chat")
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())
 
     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
                         reason="Unsupported CUDA capability")
     def test_qwen2_7b_fp8(self):
         with Runner('lmi', 'qwen2-7b-fp8') as r:
-            prepare.build_vllm_model("qwen2-7b-fp8")
+            prepare.build_vllm_async_model("qwen2-7b-fp8")
             r.launch()
             client.run("vllm qwen2-7b-fp8".split())
 

@@ -576,7 +576,7 @@ def test_llama3_8b_chunked_prefill(self):
 
     def test_falcon_11b_chunked_prefill(self):
         with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
-            prepare.build_vllm_model("falcon-11b-chunked-prefill")
+            prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
             r.launch()
             client.run(
                 "vllm falcon-11b-chunked-prefill --in_tokens 1200".split())

@@ -589,31 +589,31 @@ def test_llama_68m_speculative_medusa(self):
 
     def test_llama_68m_speculative_eagle(self):
         with Runner('lmi', 'llama-68m-speculative-eagle') as r:
-            prepare.build_vllm_model("llama-68m-speculative-eagle")
+            prepare.build_vllm_async_model("llama-68m-speculative-eagle")
             r.launch()
             client.run("vllm llama-68m-speculative-eagle".split())
 
     def test_llama3_1_8b_instruct_tool(self):
         with Runner('lmi', 'llama3-1-8b-instruct-tool') as r:
-            prepare.build_vllm_model("llama3-1-8b-instruct-tool")
+            prepare.build_vllm_async_model("llama3-1-8b-instruct-tool")
             r.launch()
             client.run("vllm_tool llama3-1-8b-instruct-tool".split())
 
     def test_mistral_7b_instruct_v03_tool(self):
         with Runner('lmi', 'mistral-7b-instruct-v03-tool') as r:
-            prepare.build_vllm_model("mistral-7b-instruct-v03-tool")
+            prepare.build_vllm_async_model("mistral-7b-instruct-v03-tool")
             r.launch()
             client.run("vllm_tool mistral-7b-instruct-v03-tool".split())
 
     def test_deepseek_r1_distill_qwen_1_5b(self):
         with Runner('lmi', 'deepseek-r1-distill-qwen-1-5b') as r:
-            prepare.build_vllm_model("deepseek-r1-distill-qwen-1-5b")
+            prepare.build_vllm_async_model("deepseek-r1-distill-qwen-1-5b")
             r.launch()
             client.run("vllm_chat deepseek-r1-distill-qwen-1-5b".split())
 
     def test_tiny_llama_input_length_exceeded(self):
         with Runner('lmi', 'tinyllama-test-input-length-exceeded') as r:
-            prepare.build_vllm_model("tinyllama-input-len-exceeded")
+            prepare.build_vllm_async_model("tinyllama-input-len-exceeded")
             r.launch()
             start = time.perf_counter()
             with pytest.raises(ValueError, match=r".*424.*"):

@@ -1020,16 +1020,16 @@ def test_phi3_v(self):
 
     def test_pixtral_12b(self):
         with Runner('lmi', 'pixtral-12b') as r:
-            prepare.build_vllm_model('pixtral-12b')
+            prepare.build_vllm_async_model('pixtral-12b')
             r.launch()
             client.run("multimodal pixtral-12b".split())
 
-    # MLlama is only supported by vllm backend currently
-    def test_mllama_11b(self):
-        with Runner('lmi', 'llama32-11b-multimodal') as r:
-            prepare.build_vllm_model('llama32-11b-multimodal')
-            r.launch()
-            client.run("multimodal llama32-11b-multimodal".split())
+    # MLlama is not supported in vllm v1, see https://github.com/vllm-project/vllm/issues/27198
+    # def test_mllama_11b(self):
+    #     with Runner('lmi', 'llama32-11b-multimodal') as r:
+    #         prepare.build_vllm_model('llama32-11b-multimodal')
+    #         r.launch()
+    #         client.run("multimodal llama32-11b-multimodal".split())
 
 
 class TestLmiDistPipelineParallel:
