Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions engines/python/setup/djl_python/async_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def _extract_lora_adapter(raw_request, decoded_payload):
adapter_name = raw_request.get_property(
SAGEMAKER_ADAPTER_IDENTIFIER_HEADER)
logging.debug(f"Found adapter in headers: {adapter_name}")
elif "adapter" in decoded_payload:
adapter_name = decoded_payload.pop("adapter")
elif "adapters" in decoded_payload:
adapter_name = decoded_payload.get("adapters")
logging.debug(f"Found adapter in payload: {adapter_name}")

return adapter_name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
ErrorResponse,
CompletionLogProbs,
)
from vllm.sequence import Logprob
from vllm.logprobs import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer

from djl_python.outputs import Output
Expand Down
18 changes: 8 additions & 10 deletions engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import OpenAIServingModels, BaseModelPath
from vllm.utils import kill_process_tree, AtomicCounter
from vllm.utils.counter import AtomicCounter
from vllm.utils.system_utils import kill_process_tree

from djl_python.properties_manager.hf_properties import HuggingFaceProperties
from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties
Expand Down Expand Up @@ -93,7 +94,6 @@ async def initialize(self, properties: dict):
self.vllm_engine = AsyncLLMEngine.from_engine_args(
self.vllm_engine_args)
self.tokenizer = await self.vllm_engine.get_tokenizer()
model_config = await self.vllm_engine.get_model_config()

model_names = self.vllm_engine_args.served_model_name or "lmi"
if not isinstance(model_names, list):
Expand All @@ -108,19 +108,16 @@ async def initialize(self, properties: dict):
self.model_name = model_names[0]
self.model_registry = OpenAIServingModels(
self.vllm_engine,
model_config,
base_model_paths,
)
self.completion_service = OpenAIServingCompletion(
self.vllm_engine,
model_config,
self.model_registry,
request_logger=None,
)

self.chat_completion_service = OpenAIServingChat(
self.vllm_engine,
model_config,
self.model_registry,
"assistant",
request_logger=None,
Expand Down Expand Up @@ -249,13 +246,14 @@ async def add_request_with_lora(*args, **kwargs):
return await original_add_request(*args, **kwargs)

self.vllm_engine.add_request = add_request_with_lora

try:
try:
response = await processed_request.inference_invoker(
processed_request.vllm_request)
finally:
self.vllm_engine.add_request = original_add_request
else:
response = await processed_request.inference_invoker(
processed_request.vllm_request)
finally:
if processed_request.lora_request:
self.vllm_engine.add_request = original_add_request

if isinstance(response, types.AsyncGeneratorType):
# Apply custom formatter to streaming response
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
from typing import Optional, Any, Dict, Tuple, Literal, Union
from pydantic import field_validator, model_validator, ConfigDict, Field
from vllm import EngineArgs, AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils import StoreBoolean
from vllm.utils.argparse_utils import FlexibleArgumentParser

from djl_python.properties_manager.properties import Properties

Expand All @@ -31,22 +30,24 @@
}


def construct_vllm_args_list(vllm_engine_args: dict):
    """Convert a dict of vLLM engine arguments into an argv-style list.

    Modified from https://github.com/vllm-project/vllm/blob/94666612a938380cb643c1555ef9aa68b7ab1e53/vllm/utils/argparse_utils.py#L441

    Conversion rules, per entry:
      * values whose string form is "true"/"false" (any case) are treated as
        store-true flags: "--key" is emitted for true, nothing for false;
      * non-empty list values expand to "--key" followed by each item's str();
        empty lists emit nothing;
      * everything else emits "--key" followed by str(value).

    :param vllm_engine_args: mapping of CLI option name (without leading
        "--") to its value.
    :return: flat list of argv tokens suitable for FlexibleArgumentParser.
    """
    args_list = []
    for key, value in vllm_engine_args.items():
        # Hoist the invariant: computed once per entry instead of twice.
        lowered = str(value).lower()
        if lowered in {'true', 'false'}:
            # Stringly-typed booleans act as store-true flags. NOTE: real
            # bools also land here (str(True).lower() == 'true'), which makes
            # the isinstance(value, bool) branch below effectively
            # unreachable; it is kept for parity with upstream.
            if lowered == 'true':
                args_list.append("--" + key)
        elif isinstance(value, bool):
            if value:
                args_list.append("--" + key)
        elif isinstance(value, list):
            # Lists expand to one flag followed by each item as its own token.
            if value:
                args_list.append("--" + key)
                for item in value:
                    args_list.append(str(item))
        else:
            args_list.append("--" + key)
            args_list.append(str(value))
    return args_list


Expand Down Expand Up @@ -228,7 +229,7 @@ def get_engine_args(self,
)
arg_cls = AsyncEngineArgs if async_engine else EngineArgs
parser = arg_cls.add_cli_args(FlexibleArgumentParser())
args_list = construct_vllm_args_list(vllm_engine_arg_dict, parser)
args_list = construct_vllm_args_list(vllm_engine_arg_dict)
args = parser.parse_args(args=args_list)
engine_args = arg_cls.from_cli_args(args)
# we have to do this separately because vllm converts it into a string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

from vllm import LLMEngine, SamplingParams
from vllm.sampling_params import RequestOutputKind
from vllm.utils import random_uuid, AtomicCounter
from vllm.utils import random_uuid
from vllm.utils.counter import AtomicCounter

from djl_python.request import Request
from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params
Expand Down Expand Up @@ -58,8 +59,7 @@ def __init__(self, model_id_or_path: str, properties: dict,
try:
self.tool_parser = ToolParserManager.get_tool_parser(
self.vllm_configs.tool_call_parser)
self.tool_parser = self.tool_parser(
self.engine.tokenizer.tokenizer)
self.tool_parser = self.tool_parser(self.get_tokenizer())
except Exception as e:
raise TypeError("Error in tool parser creation.") from e
if self.vllm_configs.enable_reasoning:
Expand All @@ -68,12 +68,12 @@ def __init__(self, model_id_or_path: str, properties: dict,
self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
self.vllm_configs.reasoning_parser)
self.reasoning_parser = self.reasoning_parser(
self.engine.tokenizer.tokenizer)
self.get_tokenizer())
except Exception as e:
raise TypeError("Error in reasoning parser creation.") from e

def get_tokenizer(self):
return self.engine.tokenizer.tokenizer
return self.engine.get_tokenizer()

def get_model_config(self):
return self.engine.model_config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,12 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
recommendedEntryPoint = "djl_python.transformers_neuronx";
} else if ("trtllm".equals(features)) {
recommendedEntryPoint = "djl_python.tensorrt_llm";
} else if ("vllm".equals(features)) {
if (pyEnv.isAsyncMode()) {
recommendedEntryPoint = "djl_python.lmi_vllm.vllm_async_service";
} else {
recommendedEntryPoint = "djl_python.huggingface";
}
} else if (pyEnv.getInitParameters().containsKey("model_id")
|| Files.exists(modelPath.resolve("config.json"))) {
recommendedEntryPoint = "djl_python.huggingface";
Expand Down
11 changes: 6 additions & 5 deletions serving/docker/lmi-container-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
torch==2.8.0
torch==2.9.0
autoawq
torchvision
peft==0.15.1
protobuf==4.25.1
transformers==4.55.2
transformers==4.56.0
hf-transfer
zstandard
datasets==3.0.1
Expand All @@ -25,12 +26,12 @@ sentence_transformers
onnxruntime-gpu==1.20.0
autoawq
tokenizers
pydantic==2.11.7
pydantic>=2.12.0
optimum==1.23.2
uvloop
ninja
peft
llmcompressor
vllm==0.10.2
https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to fix the wheel name to be consistent with the OSS vllm version. This is confusing.

Copy link
Copy Markdown
Contributor

@xyang16 xyang16 Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is nightly wheel from vllm. Not from us. We will move to 0.11.1 wheel once it's released.

xgrammar
flashinfer-python==0.2.5
flashinfer-python==0.4.1
3 changes: 2 additions & 1 deletion serving/docker/lmi.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ RUN scripts/patch_oss_dlc.sh python \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/*

COPY lmi-container-requirements.txt ./requirements.txt
RUN pip3 install torch==2.8.0 torchvision \
RUN pip3 install --upgrade pip setuptools
RUN pip3 install torch==2.9.0 torchvision \
&& pip3 install -r requirements.txt \
&& pip3 install ${djl_converter_wheel} --no-deps

Expand Down
3 changes: 0 additions & 3 deletions tests/integration/llm/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -1701,9 +1701,6 @@ def build_vllm_async_model(model):
)
options = vllm_model_list[model]
options["engine"] = "Python"
options["option.rolling_batch"] = "disable"
options["option.async_mode"] = "true"
options["option.entryPoint"] = "djl_python.lmi_vllm.vllm_async_service"

adapter_ids = options.pop("adapter_ids", [])
adapter_names = options.pop("adapter_names", [])
Expand Down
30 changes: 15 additions & 15 deletions tests/integration/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,15 +555,15 @@ def test_gemma_2b(self):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add some tests for the new models supported in vLLM 0.11, at least Qwen3-VL and Qwen3-Next

def test_llama2_7b_chat(self):
with Runner('lmi', 'llama2-7b-chat') as r:
prepare.build_vllm_model("llama2-7b-chat")
prepare.build_vllm_async_model("llama2-7b-chat")
r.launch()
client.run("vllm_chat llama2-7b-chat".split())

@pytest.mark.skipif(not is_applicable_cuda_capability(89),
reason="Unsupported CUDA capability")
def test_qwen2_7b_fp8(self):
with Runner('lmi', 'qwen2-7b-fp8') as r:
prepare.build_vllm_model("qwen2-7b-fp8")
prepare.build_vllm_async_model("qwen2-7b-fp8")
r.launch()
client.run("vllm qwen2-7b-fp8".split())

Expand All @@ -576,7 +576,7 @@ def test_llama3_8b_chunked_prefill(self):

def test_falcon_11b_chunked_prefill(self):
with Runner('lmi', 'falcon-11b-chunked-prefill') as r:
prepare.build_vllm_model("falcon-11b-chunked-prefill")
prepare.build_vllm_async_model("falcon-11b-chunked-prefill")
r.launch()
client.run(
"vllm falcon-11b-chunked-prefill --in_tokens 1200".split())
Expand All @@ -589,31 +589,31 @@ def test_llama_68m_speculative_medusa(self):

def test_llama_68m_speculative_eagle(self):
with Runner('lmi', 'llama-68m-speculative-eagle') as r:
prepare.build_vllm_model("llama-68m-speculative-eagle")
prepare.build_vllm_async_model("llama-68m-speculative-eagle")
r.launch()
client.run("vllm llama-68m-speculative-eagle".split())

def test_llama3_1_8b_instruct_tool(self):
with Runner('lmi', 'llama3-1-8b-instruct-tool') as r:
prepare.build_vllm_model("llama3-1-8b-instruct-tool")
prepare.build_vllm_async_model("llama3-1-8b-instruct-tool")
r.launch()
client.run("vllm_tool llama3-1-8b-instruct-tool".split())

def test_mistral_7b_instruct_v03_tool(self):
with Runner('lmi', 'mistral-7b-instruct-v03-tool') as r:
prepare.build_vllm_model("mistral-7b-instruct-v03-tool")
prepare.build_vllm_async_model("mistral-7b-instruct-v03-tool")
r.launch()
client.run("vllm_tool mistral-7b-instruct-v03-tool".split())

def test_deepseek_r1_distill_qwen_1_5b(self):
with Runner('lmi', 'deepseek-r1-distill-qwen-1-5b') as r:
prepare.build_vllm_model("deepseek-r1-distill-qwen-1-5b")
prepare.build_vllm_async_model("deepseek-r1-distill-qwen-1-5b")
r.launch()
client.run("vllm_chat deepseek-r1-distill-qwen-1-5b".split())

def test_tiny_llama_input_length_exceeded(self):
with Runner('lmi', 'tinyllama-test-input-length-exceeded') as r:
prepare.build_vllm_model("tinyllama-input-len-exceeded")
prepare.build_vllm_async_model("tinyllama-input-len-exceeded")
r.launch()
start = time.perf_counter()
with pytest.raises(ValueError, match=r".*424.*"):
Expand Down Expand Up @@ -1020,16 +1020,16 @@ def test_phi3_v(self):

def test_pixtral_12b(self):
with Runner('lmi', 'pixtral-12b') as r:
prepare.build_vllm_model('pixtral-12b')
prepare.build_vllm_async_model('pixtral-12b')
r.launch()
client.run("multimodal pixtral-12b".split())

# MLlama is only supported by vllm backend currently
def test_mllama_11b(self):
with Runner('lmi', 'llama32-11b-multimodal') as r:
prepare.build_vllm_model('llama32-11b-multimodal')
r.launch()
client.run("multimodal llama32-11b-multimodal".split())
# MLlama is not supported in vllm v1, see https://github.com/vllm-project/vllm/issues/27198
# def test_mllama_11b(self):
# with Runner('lmi', 'llama32-11b-multimodal') as r:
# prepare.build_vllm_model('llama32-11b-multimodal')
# r.launch()
# client.run("multimodal llama32-11b-multimodal".split())


class TestLmiDistPipelineParallel:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,13 @@ private static void setRollingBatch(
rollingBatch = "tnx";
}
} else if (isVllmEnabled(features)) {
rollingBatch = "vllm";
rollingBatch = "disable";
lmiProperties.setProperty("option.async_mode", "true");
} else if (isTrtLlmEnabled(features)) {
rollingBatch = "trtllm";
} else {
rollingBatch = "disable";
lmiProperties.setProperty("option.async_mode", "true");
}
lmiProperties.setProperty("option.rolling_batch", rollingBatch);
}
Expand Down
14 changes: 7 additions & 7 deletions wlm/src/test/java/ai/djl/serving/wlm/ModelInfoTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,14 @@ public void testInferLmiEngine() throws IOException, ModelException {
System.setProperty("SERVING_FEATURES", "vllm");
Map<String, String> modelToRollingBatch =
Map.of(
"TheBloke/Llama-2-7B-fp16", "vllm",
"openai-community/gpt2", "vllm",
"tiiuae/falcon-7b", "vllm",
"NousResearch/Hermes-2-Pro-Mistral-7B", "vllm",
"src/test/resources/local-hf-model", "vllm",
"TheBloke/Llama-2-7B-fp16", "disable",
"openai-community/gpt2", "disable",
"tiiuae/falcon-7b", "disable",
"NousResearch/Hermes-2-Pro-Mistral-7B", "disable",
"src/test/resources/local-hf-model", "disable",
"HuggingFaceH4/tiny-random-LlamaForSequenceClassification", "disable",
"THUDM/chatglm3-6b", "vllm",
"src/test/resources/local-mistral-model", "vllm");
"THUDM/chatglm3-6b", "disable",
"src/test/resources/local-mistral-model", "disable");
Path modelStore = Paths.get("build/models");
Path modelDir = modelStore.resolve("lmi_test_model");
Path prop = modelDir.resolve("serving.properties");
Expand Down
Loading