From 6086aea2e7919888005a269c0f2a94cc26be103a Mon Sep 17 00:00:00 2001
From: Loki
Date: Wed, 19 Nov 2025 02:32:36 +0000
Subject: [PATCH 1/7] Fix race condition in streaming responses

---
 .../djl_python/lmi_vllm/vllm_async_service.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
index 792e462ab..31d503c11 100644
--- a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
+++ b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
@@ -239,18 +239,9 @@ async def inference(
             return output

         if processed_request.lora_request:
-            original_add_request = self.vllm_engine.add_request
-
-            async def add_request_with_lora(*args, **kwargs):
-                kwargs['lora_request'] = processed_request.lora_request
-                return await original_add_request(*args, **kwargs)
-
-            self.vllm_engine.add_request = add_request_with_lora
-            try:
-                response = await processed_request.inference_invoker(
-                    processed_request.vllm_request)
-            finally:
-                self.vllm_engine.add_request = original_add_request
+            response = await processed_request.inference_invoker(
+                processed_request.vllm_request,
+                lora_request=processed_request.lora_request)
         else:
             response = await processed_request.inference_invoker(
                 processed_request.vllm_request)

From 1984e7d7432d0d41b471d6fb99998cfc54a53bf7 Mon Sep 17 00:00:00 2001
From: Loki
Date: Wed, 19 Nov 2025 07:12:30 +0000
Subject: [PATCH 2/7] Trying a different interface to vllm lora registry

---
 .../djl_python/lmi_vllm/vllm_async_service.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
index 31d503c11..3b648e4a0 100644
--- a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
+++ b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
@@ -165,6 +165,10 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
             logging.info(
                 f"Using LoRA request: {lora_request.lora_name} (ID: {lora_request.lora_int_id})"
             )
+            # Register the LoRA request with the model registry so vLLM can find it
+            self.model_registry.lora_requests[adapter_name] = lora_request
+            # Set the model field to the adapter name so vLLM's _maybe_get_adapters() can extract it
+            decoded_payload["model"] = adapter_name

         # completions request
         if "prompt" in decoded_payload:
@@ -238,13 +242,9 @@ async def inference(
                 "", error=f"Input parsing failed: {str(e)}", code=424)
             return output

-        if processed_request.lora_request:
-            response = await processed_request.inference_invoker(
-                processed_request.vllm_request,
-                lora_request=processed_request.lora_request)
-        else:
-            response = await processed_request.inference_invoker(
-                processed_request.vllm_request)
+        # vLLM will extract the adapter from the request object via _maybe_get_adapters()
+        response = await processed_request.inference_invoker(
+            processed_request.vllm_request)

         if isinstance(response, types.AsyncGeneratorType):
             # Apply custom formatter to streaming response
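Patch 1's subject names the underlying bug: the old code monkey-patched add_request on the shared engine object to smuggle a per-request LoRA adapter into the call, so concurrent (especially streaming) requests could observe each other's patch. The sketch below is a self-contained toy, not DJL or vLLM code; the Engine class and patched_call helper are hypothetical stand-ins that reproduce the hazard with two interleaved asyncio tasks.

    import asyncio

    class Engine:
        async def add_request(self, prompt, lora_request=None):
            await asyncio.sleep(0)   # yield to the event loop, as a real engine would
            return lora_request      # report which adapter this request ran with

    engine = Engine()

    async def patched_call(adapter):
        # The pattern removed by patch 1: swap the shared method, call, restore.
        original = engine.add_request

        async def add_request_with_lora(*args, **kwargs):
            kwargs["lora_request"] = adapter
            return await original(*args, **kwargs)

        engine.add_request = add_request_with_lora
        try:
            return await engine.add_request("prompt")
        finally:
            engine.add_request = original  # may restore another request's wrapper

    async def main():
        results = await asyncio.gather(patched_call("adapter-a"),
                                       patched_call("adapter-b"))
        print(results)  # interleaving can yield ['adapter-a', 'adapter-a']

    asyncio.run(main())

With this interleaving, the second request captures the first request's wrapper as its "original", runs with the wrong adapter, and its finally clause leaves a stale wrapper installed on the engine. Passing lora_request as an explicit per-call argument, as patch 1 does, keeps the adapter in request scope and mutates nothing shared; patch 2 then drops even that in favor of routing carried inside the request itself.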
From 022f0eedf547c441381ba677b8d0e7de9c5d29d4 Mon Sep 17 00:00:00 2001
From: Loki
Date: Wed, 19 Nov 2025 17:53:50 +0000
Subject: [PATCH 3/7] Fixing model name routing for adapter

---
 .../python/setup/djl_python/lmi_vllm/request_response_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py b/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
index bb050cc6d..0d0168845 100644
--- a/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
+++ b/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
@@ -55,6 +55,7 @@ def convert_lmi_schema_to_completion_request(
     parameters = payload.get("parameters", {})

     completion_dict = {
+        "model": payload.pop("model"),
         "prompt": payload.pop("inputs"),
         "max_tokens": parameters.pop("max_new_tokens", 30),
         "echo": parameters.pop("return_full_text", False),

From 13dfbf8da3917e28a0127562bb336075e4c53a20 Mon Sep 17 00:00:00 2001
From: Loki
Date: Wed, 19 Nov 2025 19:39:07 +0000
Subject: [PATCH 4/7] removing redundant registration

---
 engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
index 3b648e4a0..f3d216561 100644
--- a/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
+++ b/engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py
@@ -165,8 +165,6 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
             logging.info(
                 f"Using LoRA request: {lora_request.lora_name} (ID: {lora_request.lora_int_id})"
             )
-            # Register the LoRA request with the model registry so vLLM can find it
-            self.model_registry.lora_requests[adapter_name] = lora_request
             # Set the model field to the adapter name so vLLM's _maybe_get_adapters() can extract it
             decoded_payload["model"] = adapter_name

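Patches 2 through 4 converge on name-based routing: preprocessing writes the adapter name into the payload's "model" field, the converter carries it onto the completion request, and vLLM's _maybe_get_adapters() resolves that name against adapters already registered with the engine. The toy resolver below only illustrates the shape of that lookup; the REGISTERED_ADAPTERS dict and maybe_get_adapter function are illustrative stand-ins, not vLLM's actual API (the lora_name and lora_int_id fields mirror those logged in patch 2).

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class LoRARequest:
        lora_name: str
        lora_int_id: int
        lora_path: str

    # Adapters registered with the engine at startup (names are examples).
    REGISTERED_ADAPTERS = {
        "my-adapter": LoRARequest("my-adapter", 1, "/opt/adapters/my-adapter"),
    }

    def maybe_get_adapter(request_model: str,
                          base_model: str) -> Optional[LoRARequest]:
        # Requests naming the base model get no adapter; adapter names are
        # looked up, and unknown names fail rather than silently falling back.
        if request_model == base_model:
            return None
        if request_model not in REGISTERED_ADAPTERS:
            raise ValueError(f"unknown model or adapter: {request_model!r}")
        return REGISTERED_ADAPTERS[request_model]

    print(maybe_get_adapter("my-adapter", "base-model"))  # routed to the LoRA
    print(maybe_get_adapter("base-model", "base-model"))  # None: base model

Once the name travels inside the request, the explicit registration added in patch 2 becomes redundant, which is exactly what patch 4 removes.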
From bb5edfc9c9c4c579b9b1006c16f53dcddc9a6368 Mon Sep 17 00:00:00 2001
From: Loki
Date: Wed, 19 Nov 2025 20:35:10 +0000
Subject: [PATCH 7/7] Reverting backwards compatibility change for fast fail behavior

---
 .../python/setup/djl_python/lmi_vllm/request_response_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py b/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
index 59a61f559..0d0168845 100644
--- a/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
+++ b/engines/python/setup/djl_python/lmi_vllm/request_response_utils.py
@@ -55,7 +55,7 @@ def convert_lmi_schema_to_completion_request(
     parameters = payload.get("parameters", {})

     completion_dict = {
-        "model": payload.pop("model", None),
+        "model": payload.pop("model"),
         "prompt": payload.pop("inputs"),
         "max_tokens": parameters.pop("max_new_tokens", 30),
         "echo": parameters.pop("return_full_text", False),
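The last three patches settle how strict convert_lmi_schema_to_completion_request should be about a missing "model" key. Note that patch 5 as committed is a syntax error: "model": payload.pop("model"), None, drops a stray None element into the dict literal. Patch 6 turns it into the intended fallback payload.pop("model", None), and patch 7 reverts to the strict form for fast-fail behavior. A quick illustration of the two dict.pop behaviors, using a stand-in payload:

    payload = {"inputs": "Hello"}  # request payload with no "model" key

    # Fallback (patches 5-6): a missing key silently becomes None, and the
    # request would proceed against the base model.
    model = dict(payload).pop("model", None)
    assert model is None

    # Fail fast (patch 7, the final state): a missing key raises KeyError at
    # parse time, so a malformed request is rejected immediately.
    try:
        dict(payload).pop("model")
    except KeyError as err:
        print(f"rejected: missing {err} in payload")

As the subject line of patch 7 says, the revert favors surfacing a bad request at parse time over silently serving it with the base model.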