
Commit 62959ea

Add fixes for vllm 0.11.1
1 parent f228a8a commit 62959ea

13 files changed: 63 additions, 122 deletions

engines/python/setup/djl_python/async_utils.py

Lines changed: 2 additions & 2 deletions
@@ -127,8 +127,8 @@ def _extract_lora_adapter(raw_request, decoded_payload):
         adapter_name = raw_request.get_property(
             SAGEMAKER_ADAPTER_IDENTIFIER_HEADER)
         logging.debug(f"Found adapter in headers: {adapter_name}")
-    elif "adapter" in decoded_payload:
-        adapter_name = decoded_payload.get("adapter")
+    elif "adapters" in decoded_payload:
+        adapter_name = decoded_payload.get("adapters")
         logging.debug(f"Found adapter in payload: {adapter_name}")

     return adapter_name
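
Note: with this change the LMI/TGI-style payload selects a LoRA adapter through the "adapters" key rather than "adapter", matching the key the integration tests already send. A minimal request-body sketch (the adapter name is a placeholder):

{
    "inputs": "What is Deep Learning?",
    "adapters": "my-adapter",
    "parameters": {"max_new_tokens": 64}
}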

engines/python/setup/djl_python/lmi_vllm/request_response_utils.py

Lines changed: 3 additions & 11 deletions
@@ -11,7 +11,6 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 import json
-import logging
 from typing import Callable, Tuple, Union, List, Dict
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest,
@@ -27,8 +26,6 @@
 from djl_python.outputs import Output
 from djl_python.async_utils import create_non_stream_output, create_stream_chunk_output

-logger = logging.getLogger(__name__)
-

 class ProcessedRequest:

@@ -55,21 +52,16 @@ def __init__(

 def convert_lmi_schema_to_completion_request(
         payload: dict, ) -> Tuple[CompletionRequest, bool, bool]:
-    # Create a copy to avoid mutating the original
-    parameters = payload.get("parameters", {}).copy()
-
-    prompt = payload.get("inputs", "")
-    if not prompt:
-        raise ValueError("Input prompt cannot be empty")
+    parameters = payload.get("parameters", {})

     completion_dict = {
-        "prompt": prompt,
+        "prompt": payload.pop("inputs"),
         "max_tokens": parameters.pop("max_new_tokens", 30),
         "echo": parameters.pop("return_full_text", False),
         "truncate_prompt_tokens": parameters.pop("truncate", None),
         "n": parameters.pop("top_n_tokens", 1),
         "ignore_eos": parameters.pop("ignore_eos_token", False),
-        "stream": payload.get("stream", False),
+        "stream": payload.pop("stream", False),
     }
     # 1. when details are requested, return token details for the likely tokens (logprobs=1)
     # TGI only returns prompt token details when details is also enabled
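
For reference, a rough usage sketch of the updated conversion (the payload values are illustrative only, not from this commit):

payload = {
    "inputs": "What is Deep Learning?",
    "parameters": {"max_new_tokens": 64, "return_full_text": False},
    "stream": False,
}
request, include_details, include_prompt = convert_lmi_schema_to_completion_request(payload)
# request.prompt == "What is Deep Learning?", request.max_tokens == 64, request.stream is False
# "inputs" and "stream" are popped, so the caller's payload dict is mutated in place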

engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py

Lines changed: 1 addition & 11 deletions
@@ -10,8 +10,6 @@
 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
-import asyncio
-import copy
 import logging
 import os
 import types
@@ -25,7 +23,7 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels, BaseModelPath
-from vllm.utils import AtomicCounter
+from vllm.utils.counter import AtomicCounter
 from vllm.utils.system_utils import kill_process_tree

 from djl_python.properties_manager.hf_properties import HuggingFaceProperties
@@ -77,7 +75,6 @@ def __init__(self):
         self.adapter_registry = {}
         self.lora_id_counter = AtomicCounter(0)
         self.lora_requests = {}
-        self._lora_lock = asyncio.Lock()

     async def initialize(self, properties: dict):
         self.hf_configs = HuggingFaceProperties(**properties)
@@ -97,7 +94,6 @@ async def initialize(self, properties: dict):
         self.vllm_engine = AsyncLLMEngine.from_engine_args(
             self.vllm_engine_args)
         self.tokenizer = await self.vllm_engine.get_tokenizer()
-        model_config = self.vllm_engine.model_config

         model_names = self.vllm_engine_args.served_model_name or "lmi"
         if not isinstance(model_names, list):
@@ -143,9 +139,6 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
         session = get_session(self.session_manager, raw_request)
         content_type = raw_request.get_property("Content-Type")
         decoded_payload = decode(raw_request, content_type)
-        # Create a deep copy to prevent mutations from affecting the original
-        decoded_payload = copy.deepcopy(decoded_payload)
-        logger.info(f"Decoded payload after deepcopy: inputs={decoded_payload.get('inputs', 'N/A')}, stream={decoded_payload.get('stream', 'N/A')}")

         adapter_name = _extract_lora_adapter(raw_request, decoded_payload)

@@ -181,10 +174,8 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
             stream_output_formatter = vllm_stream_output_formatter
         # TGI request gets mapped to completions
         elif "inputs" in decoded_payload:
-            logger.info(f"Before convert_lmi_schema: inputs={decoded_payload.get('inputs', 'N/A')}")
             vllm_request, include_details, include_prompt = convert_lmi_schema_to_completion_request(
                 decoded_payload)
-            logger.info(f"After convert_lmi_schema: vllm_request.prompt={vllm_request.prompt if hasattr(vllm_request, 'prompt') else 'N/A'}")
             vllm_invoke_function = self.completion_service.create_completion
             non_stream_output_formatter = lmi_with_details_non_stream_output_formatter if include_details else lmi_non_stream_output_formatter
             stream_output_formatter = lmi_with_details_stream_output_formatter if include_details else lmi_stream_output_formatter
@@ -248,7 +239,6 @@ async def inference(
             return output

         if processed_request.lora_request:
-            logger.info(f"Processing LoRA request: {processed_request.lora_request.lora_name}")
             original_add_request = self.vllm_engine.add_request

             async def add_request_with_lora(*args, **kwargs):
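
A note on the import relocation above: in vLLM 0.11.x the helpers appear to have been split out of the vllm.utils module into submodules, so AtomicCounter now comes from vllm.utils.counter (and FlexibleArgumentParser from vllm.utils.argparse_utils, see the properties change below). A hedged compatibility sketch, not part of this commit, that would tolerate both layouts:

try:
    # layout used by this commit (vLLM 0.11.x)
    from vllm.utils.counter import AtomicCounter
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
    # older layout, as removed by this commit
    from vllm.utils import AtomicCounter, FlexibleArgumentParser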

engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py

Lines changed: 18 additions & 17 deletions
@@ -15,8 +15,7 @@
 from typing import Optional, Any, Dict, Tuple, Literal, Union
 from pydantic import field_validator, model_validator, ConfigDict, Field
 from vllm import EngineArgs, AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
-from vllm.utils import StoreBoolean
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 from djl_python.properties_manager.properties import Properties

@@ -31,22 +30,24 @@
 }


-def construct_vllm_args_list(vllm_engine_args: dict,
-                             parser: FlexibleArgumentParser):
-    # Modified from https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/utils.py#L1258
+def construct_vllm_args_list(vllm_engine_args: dict):
+    # Modified from https://github.com/vllm-project/vllm/blob/94666612a938380cb643c1555ef9aa68b7ab1e53/vllm/utils/argparse_utils.py#L441
     args_list = []
-    store_boolean_arguments = {
-        action.dest
-        for action in parser._actions if isinstance(action, StoreBoolean)
-    }
-    for engine_arg, engine_arg_value in vllm_engine_args.items():
-        if str(engine_arg_value).lower() in {
-                'true', 'false'
-        } and engine_arg not in store_boolean_arguments:
-            if str(engine_arg_value).lower() == 'true':
-                args_list.append(f"--{engine_arg}")
+    for key, value in vllm_engine_args.items():
+        if str(value).lower() in {'true', 'false'}:
+            if str(value).lower() == 'true':
+                args_list.append("--" + key)
+        elif isinstance(value, bool):
+            if value:
+                args_list.append("--" + key)
+        elif isinstance(value, list):
+            if value:
+                args_list.append("--" + key)
+                for item in value:
+                    args_list.append(str(item))
         else:
-            args_list.append(f"--{engine_arg}={engine_arg_value}")
+            args_list.append("--" + key)
+            args_list.append(str(value))
     return args_list


@@ -228,7 +229,7 @@ def get_engine_args(self,
         )
         arg_cls = AsyncEngineArgs if async_engine else EngineArgs
         parser = arg_cls.add_cli_args(FlexibleArgumentParser())
-        args_list = construct_vllm_args_list(vllm_engine_arg_dict, parser)
+        args_list = construct_vllm_args_list(vllm_engine_arg_dict)
         args = parser.parse_args(args=args_list)
         engine_args = arg_cls.from_cli_args(args)
         # we have to do this separately because vllm converts it into a string
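
The rewritten helper no longer needs the parser to detect StoreBoolean actions; booleans, lists, and plain scalars are handled directly, and each value becomes a separate argv token instead of a --key=value pair. A quick illustrative call (the option values are made up):

engine_args = {
    "max_model_len": 4096,
    "enable_lora": True,
    "trust_remote_code": "false",
    "served_model_name": ["lmi", "my-model"],
}
print(construct_vllm_args_list(engine_args))
# ['--max_model_len', '4096', '--enable_lora', '--served_model_name', 'lmi', 'my-model']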

engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py

Lines changed: 5 additions & 5 deletions
@@ -14,7 +14,8 @@

 from vllm import LLMEngine, SamplingParams
 from vllm.sampling_params import RequestOutputKind
-from vllm.utils import random_uuid, AtomicCounter
+from vllm.utils import random_uuid
+from vllm.utils.counter import AtomicCounter

 from djl_python.request import Request
 from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params
@@ -58,8 +59,7 @@ def __init__(self, model_id_or_path: str, properties: dict,
             try:
                 self.tool_parser = ToolParserManager.get_tool_parser(
                     self.vllm_configs.tool_call_parser)
-                self.tool_parser = self.tool_parser(
-                    self.engine.tokenizer.tokenizer)
+                self.tool_parser = self.tool_parser(self.get_tokenizer())
             except Exception as e:
                 raise TypeError("Error in tool parser creation.") from e
         if self.vllm_configs.enable_reasoning:
@@ -68,12 +68,12 @@ def __init__(self, model_id_or_path: str, properties: dict,
                 self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
                     self.vllm_configs.reasoning_parser)
                 self.reasoning_parser = self.reasoning_parser(
-                    self.engine.tokenizer.tokenizer)
+                    self.get_tokenizer())
             except Exception as e:
                 raise TypeError("Error in reasoning parser creation.") from e

     def get_tokenizer(self):
-        return self.engine.tokenizer.tokenizer
+        return self.engine.get_tokenizer()

     def get_model_config(self):
         return self.engine.model_config
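
With this change the rolling-batch handler obtains the tokenizer through LLMEngine.get_tokenizer() instead of reaching into engine.tokenizer.tokenizer. A minimal sketch of the new pattern (engine construction omitted; the parser name is a placeholder and ToolParserManager is assumed to be imported as it already is in this module):

tokenizer = engine.get_tokenizer()  # engine: an initialized vllm.LLMEngine
parser_cls = ToolParserManager.get_tool_parser("hermes")  # "hermes" is a placeholder name
tool_parser = parser_cls(tokenizer)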

engines/python/src/main/java/ai/djl/python/engine/PyModel.java

Lines changed: 4 additions & 4 deletions
@@ -188,10 +188,10 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
         } else if ("trtllm".equals(features)) {
             recommendedEntryPoint = "djl_python.tensorrt_llm";
         } else if ("vllm".equals(features)) {
-            recommendedEntryPoint = "djl_python.lmi_vllm.vllm_async_service";
-            pyEnv.setAsyncMode(true);
-            if (!properties.containsKey("rolling_batch")) {
-                setProperty("rolling_batch", "disable");
+            if (pyEnv.isAsyncMode()) {
+                recommendedEntryPoint = "djl_python.lmi_vllm.vllm_async_service";
+            } else {
+                recommendedEntryPoint = "djl_python.huggingface";
             }
         } else if (pyEnv.getInitParameters().containsKey("model_id")
                 || Files.exists(modelPath.resolve("config.json"))) {

engines/python/src/test/java/ai/djl/python/engine/PyEngineTest.java

Lines changed: 0 additions & 39 deletions
@@ -599,43 +599,4 @@ public void testRestartProcess() throws IOException, ModelException, Interrupted
             Assert.assertEquals(output.getCode(), 200);
         }
     }
-
-    @Test
-    public void testVllmFeaturesRollingBatch() throws IOException, ModelException {
-        System.setProperty("SERVING_FEATURES", "vllm");
-        try {
-            Criteria<Input, Output> criteria =
-                    Criteria.builder()
-                            .setTypes(Input.class, Output.class)
-                            .optModelPath(Paths.get("src/test/resources/echo"))
-                            .optEngine("Python")
-                            .build();
-            try (ZooModel<Input, Output> model = criteria.loadModel()) {
-                // Verify rolling_batch is set to disable when features=vllm
-                Assert.assertEquals(model.getProperty("rolling_batch"), "disable");
-            }
-        } finally {
-            System.clearProperty("SERVING_FEATURES");
-        }
-    }
-
-    @Test
-    public void testVllmFeaturesRollingBatchOverride() throws IOException, ModelException {
-        System.setProperty("SERVING_FEATURES", "vllm");
-        try {
-            Criteria<Input, Output> criteria =
-                    Criteria.builder()
-                            .setTypes(Input.class, Output.class)
-                            .optModelPath(Paths.get("src/test/resources/echo"))
-                            .optOption("rolling_batch", "vllm")
-                            .optEngine("Python")
-                            .build();
-            try (ZooModel<Input, Output> model = criteria.loadModel()) {
-                // Verify user override is respected
-                Assert.assertEquals(model.getProperty("rolling_batch"), "vllm");
-            }
-        } finally {
-            System.clearProperty("SERVING_FEATURES");
-        }
-    }
-
 }

serving/docker/lmi-container-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -32,6 +32,6 @@ uvloop
 ninja
 peft
 llmcompressor
-vllm @ git+https://github.com/vllm-project/vllm.git
+https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 xgrammar
 flashinfer-python==0.4.1

tests/integration/llm/client.py

Lines changed: 4 additions & 9 deletions
@@ -1800,7 +1800,7 @@ def test_handler_adapters(model, model_spec):
         }
         req["parameters"] = params
         req["adapters"] = adapter
-        reqs.append(req.copy())
+        reqs.append(req)
     for req in reqs:
         for stream in stream_values:
             req["stream"] = stream
@@ -1830,19 +1830,13 @@ def test_handler_adapters(model, model_spec):
     LOGGER.info(f"del adapter {res}")
     headers = {'content-type': 'application/json'}
    endpoint = f"http://127.0.0.1:8080/invocations"
-    # Create a fresh copy to avoid using mutated request
-    import copy
-    req0_copy = copy.deepcopy(reqs[0])
     res = requests.post(endpoint, headers=headers,
-                        json=req0_copy).content.decode("utf-8")
+                        json=reqs[0]).content.decode("utf-8")
     LOGGER.info(f"call deleted adapter {res}")

     if len(reqs) > 1:
-        # Create a fresh copy to avoid using mutated request
-        req1_copy = copy.deepcopy(reqs[1])
-        LOGGER.info(f"Request being sent: {req1_copy}")
         res = requests.post(endpoint, headers=headers,
-                            json=req1_copy).content.decode("utf-8")
+                            json=reqs[1]).content.decode("utf-8")
         LOGGER.info(f"call valid adapter after deletion {res}")
         if not res or res.strip() == "":
             LOGGER.error(f"Empty response received from model API: {res}")
@@ -1878,6 +1872,7 @@ def test_handler_adapters(model, model_spec):
         LOGGER.error(msg)
         raise RuntimeError(msg)

+
 def test_handler_rolling_batch_chat(model, model_spec):
     modelspec_checker(model, model_spec)
     spec = model_spec[args.model]

tests/integration/llm/prepare.py

Lines changed: 1 addition & 0 deletions
@@ -1777,6 +1777,7 @@ def build_vllm_model(model):
     )
     options = vllm_model_list[model]
     options["engine"] = "Python"
+    options["option.rolling_batch"] = "vllm"

     adapter_ids = options.pop("adapter_ids", [])
     adapter_names = options.pop("adapter_names", [])
