 # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
+import asyncio
+import copy
 import logging
 import os
 import types
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels, BaseModelPath
-from vllm.utils import kill_process_tree, AtomicCounter
+from vllm.utils import AtomicCounter
+from vllm.utils.system_utils import kill_process_tree

 from djl_python.properties_manager.hf_properties import HuggingFaceProperties
 from djl_python.properties_manager.vllm_rb_properties import VllmRbProperties
@@ -74,6 +77,7 @@ def __init__(self):
         self.adapter_registry = {}
         self.lora_id_counter = AtomicCounter(0)
         self.lora_requests = {}
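+        # Guards concurrent LoRA adapter registration/removal (assumed use; the
+        # call sites that acquire this lock are outside the hunks shown here).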
+        self._lora_lock = asyncio.Lock()

     async def initialize(self, properties: dict):
         self.hf_configs = HuggingFaceProperties(**properties)
@@ -93,7 +97,7 @@ async def initialize(self, properties: dict):
         self.vllm_engine = AsyncLLMEngine.from_engine_args(
             self.vllm_engine_args)
         self.tokenizer = await self.vllm_engine.get_tokenizer()
-        model_config = await self.vllm_engine.get_model_config()
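+        # Read the model config directly off the engine; the awaited
+        # get_model_config() accessor from the removed line is no longer used.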
+        model_config = self.vllm_engine.model_config

         model_names = self.vllm_engine_args.served_model_name or "lmi"
         if not isinstance(model_names, list):
@@ -108,19 +112,16 @@ async def initialize(self, properties: dict):
         self.model_name = model_names[0]
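+        # model_config is no longer threaded through the OpenAI serving helpers
+        # below (assumption: this vLLM version derives it from the engine client).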
         self.model_registry = OpenAIServingModels(
             self.vllm_engine,
-            model_config,
             base_model_paths,
         )
         self.completion_service = OpenAIServingCompletion(
             self.vllm_engine,
-            model_config,
             self.model_registry,
             request_logger=None,
         )

         self.chat_completion_service = OpenAIServingChat(
             self.vllm_engine,
-            model_config,
             self.model_registry,
             "assistant",
             request_logger=None,
@@ -142,6 +143,9 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
         session = get_session(self.session_manager, raw_request)
         content_type = raw_request.get_property("Content-Type")
         decoded_payload = decode(raw_request, content_type)
+        # Create a deep copy to prevent mutations from affecting the original
+        decoded_payload = copy.deepcopy(decoded_payload)
+        logger.info(f"Decoded payload after deepcopy: inputs={decoded_payload.get('inputs', 'N/A')}, stream={decoded_payload.get('stream', 'N/A')}")

         adapter_name = _extract_lora_adapter(raw_request, decoded_payload)

@@ -177,8 +181,10 @@ def preprocess_request(self, inputs: Input) -> ProcessedRequest:
             stream_output_formatter = vllm_stream_output_formatter
         # TGI request gets mapped to completions
         elif "inputs" in decoded_payload:
+            logger.info(f"Before convert_lmi_schema: inputs={decoded_payload.get('inputs', 'N/A')}")
             vllm_request, include_details, include_prompt = convert_lmi_schema_to_completion_request(
                 decoded_payload)
+            logger.info(f"After convert_lmi_schema: vllm_request.prompt={vllm_request.prompt if hasattr(vllm_request, 'prompt') else 'N/A'}")
             vllm_invoke_function = self.completion_service.create_completion
             non_stream_output_formatter = lmi_with_details_non_stream_output_formatter if include_details else lmi_non_stream_output_formatter
             stream_output_formatter = lmi_with_details_stream_output_formatter if include_details else lmi_stream_output_formatter
@@ -242,20 +248,22 @@ async def inference(
             return output

         if processed_request.lora_request:
+            logger.info(f"Processing LoRA request: {processed_request.lora_request.lora_name}")
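+            # Temporarily swap the engine's add_request for a wrapper that injects
+            # this request's LoRA adapter; the original is restored in the finally block.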
             original_add_request = self.vllm_engine.add_request

             async def add_request_with_lora(*args, **kwargs):
                 kwargs['lora_request'] = processed_request.lora_request
                 return await original_add_request(*args, **kwargs)

             self.vllm_engine.add_request = add_request_with_lora
-
-        try:
+            try:
+                response = await processed_request.inference_invoker(
+                    processed_request.vllm_request)
+            finally:
+                self.vllm_engine.add_request = original_add_request
+        else:
             response = await processed_request.inference_invoker(
                 processed_request.vllm_request)
-        finally:
-            if processed_request.lora_request:
-                self.vllm_engine.add_request = original_add_request

         if isinstance(response, types.AsyncGeneratorType):
             # Apply custom formatter to streaming response