
Commit aec4393

Cherrypick LoRA fix and vLLM upgrade changes to Release branch (#2958)

Authored by ksuma2109, Lokiiiiii, and Suma Kasa
Co-authored-by: Loki <lokravi@amazon.com>
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
1 parent d3561e5 commit aec4393

File tree: 4 files changed (+8 −17 lines)

engines/python/setup/djl_python/lmi_vllm/request_response_utils.py

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ def convert_lmi_schema_to_completion_request(
     parameters = payload.get("parameters", {})
 
     completion_dict = {
+        "model": payload.pop("model"),
         "prompt": payload.pop("inputs"),
         "max_tokens": parameters.pop("max_new_tokens", 30),
         "echo": parameters.pop("return_full_text", False),

engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py

Lines changed: 5 additions & 16 deletions
@@ -165,6 +165,8 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
             logging.info(
                 f"Using LoRA request: {lora_request.lora_name} (ID: {lora_request.lora_int_id})"
             )
+            # Set the model field to the adapter name so vLLM's _maybe_get_adapters() can extract it
+            decoded_payload["model"] = adapter_name
 
         # completions request
         if "prompt" in decoded_payload:
@@ -238,22 +240,9 @@ async def inference(
                 "", error=f"Input parsing failed: {str(e)}", code=424)
             return output
 
-        if processed_request.lora_request:
-            original_add_request = self.vllm_engine.add_request
-
-            async def add_request_with_lora(*args, **kwargs):
-                kwargs['lora_request'] = processed_request.lora_request
-                return await original_add_request(*args, **kwargs)
-
-            self.vllm_engine.add_request = add_request_with_lora
-            try:
-                response = await processed_request.inference_invoker(
-                    processed_request.vllm_request)
-            finally:
-                self.vllm_engine.add_request = original_add_request
-        else:
-            response = await processed_request.inference_invoker(
-                processed_request.vllm_request)
+        # vLLM will extract the adapter from the request object via _maybe_get_adapters()
+        response = await processed_request.inference_invoker(
+            processed_request.vllm_request)
 
         if isinstance(response, types.AsyncGeneratorType):
             # Apply custom formatter to streaming response
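
The net effect: instead of temporarily monkey-patching the shared vllm_engine.add_request for each LoRA call, the handler puts the adapter name in the request's model field and lets vLLM resolve it. A minimal sketch of that resolution pattern, assuming a hypothetical adapter registry (this is not vLLM's actual _maybe_get_adapters() implementation):

from dataclasses import dataclass
from typing import Optional

@dataclass
class LoRARequest:
    # mirrors the vllm.lora.request.LoRARequest fields used in the log line above
    lora_name: str
    lora_int_id: int
    lora_path: str

# hypothetical registry of adapters loaded at startup
registered_adapters = {
    "my-adapter": LoRARequest("my-adapter", 1, "/opt/ml/adapters/my-adapter"),
}

def maybe_get_adapter(request_model: str) -> Optional[LoRARequest]:
    """Resolve a LoRA adapter from the request's "model" field; None means the base model."""
    return registered_adapters.get(request_model)

Because the adapter now travels inside the request object itself, concurrent requests no longer contend on the mutated engine attribute that the removed code swapped in and out.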

serving/docker/lmi-container-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ uvloop
 ninja
 peft
 llmcompressor
-https://vllm-wheels.s3.us-west-2.amazonaws.com/93103575ce0480f36fc1a3603eb51d9a89f38a00/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+vllm==0.11.1
 xgrammar
 flashinfer-python==0.5.2
 lmcache
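
Since the pinned release replaces a dev wheel URL, a quick sanity check that a container picked up the pinned build might look like this (a sketch; the check itself is not part of this diff):

import vllm
assert vllm.__version__ == "0.11.1", f"unexpected vLLM build: {vllm.__version__}"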

tests/integration/llm/prepare.py

Lines changed: 1 addition & 0 deletions
@@ -162,6 +162,7 @@
         '{"method":"eagle","model":"yuhuili/EAGLE-LLaMA3.1-Instruct-8B","num_speculative_tokens":4}',
         "option.tensor_parallel_degree": 4,
         "option.max_rolling_batch_size": 4,
+        "option.enforce_eager": True,
     },
     "llama-7b-unmerged-lora": {
         "option.model_id": "s3://djl-llm/huggyllama-llama-7b",
