 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.utils import merge_async_iterators

 CONFIG_FILE = os.getenv("CONFIG_FILE")
@@ -202,7 +202,7 @@ def determine_max_concurrent_requests(
     # anecdotally, we're seeing the engine able to handle around 7req/s (for outlines), so set to 30 * 7 ~= 200
     if any(
         request.to_sampling_params(
-            default_max_tokens=1, logits_processor_pattern=None
+            max_tokens=1, logits_processor_pattern=None, default_sampling_params={}
         ).guided_decoding
         for request in requests
     ):
@@ -294,7 +294,6 @@ async def init_engine(
             os.environ.get("NUM_INSTANCES", 1)
         ),  # TODO maybe do something other than TP=8, PP=number of nodes
         seed=request.model_cfg.seed or 0,
-        disable_log_requests=True,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
     default_engine_args_dict.update(engine_args_dict)
@@ -304,15 +303,21 @@ async def init_engine(
     engine_client = AsyncLLMEngine.from_engine_args(engine_args)
     model_config = await engine_client.get_model_config()
     resolved_chat_template = load_chat_template(parsed_configs.chat_template)
+
     base_model_paths = [BaseModelPath(name=served_model_name, model_path=model_id)]

+    openai_serving_models = OpenAIServingModels(
+        engine_client=engine_client,
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+    )
+    await openai_serving_models.init_static_loras()
+
     openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         response_role=request.model_cfg.response_role or "assistant",
-        lora_modules=None,
-        prompt_adapters=None,
         request_logger=None,
         chat_template=resolved_chat_template,
         chat_template_content_format=None,
@@ -321,9 +326,7 @@ async def init_engine(
     openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=None,
-        prompt_adapters=None,
+        openai_serving_models,
         request_logger=None,
     )

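For reference, a minimal sketch (assuming a vLLM release that ships vllm.entrypoints.openai.serving_models) of how the serving objects are wired after this change: OpenAIServingModels is built once from the engine client, model config, and base model paths, its static LoRAs are initialized, and the same instance is passed to both OpenAIServingChat and OpenAIServingCompletion in place of the old base_model_paths / lora_modules / prompt_adapters arguments. The helper name build_serving_objects and its parameters are hypothetical; the constructor keywords mirror the diff above.

# Sketch only, not part of the diff. Assumes a vLLM version that provides
# OpenAIServingModels; helper name and arguments are illustrative.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels


async def build_serving_objects(model_id: str, served_model_name: str):
    # Engine setup, trimmed to the essentials of init_engine.
    engine_args = AsyncEngineArgs(model=model_id)
    engine_client = AsyncLLMEngine.from_engine_args(engine_args)
    model_config = await engine_client.get_model_config()

    # OpenAIServingModels now owns the model/LoRA bookkeeping that used to be
    # passed directly to the chat and completion servers.
    serving_models = OpenAIServingModels(
        engine_client=engine_client,
        model_config=model_config,
        base_model_paths=[BaseModelPath(name=served_model_name, model_path=model_id)],
    )
    await serving_models.init_static_loras()

    openai_serving_chat = OpenAIServingChat(
        engine_client,
        model_config,
        serving_models,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format=None,
    )
    openai_serving_completion = OpenAIServingCompletion(
        engine_client,
        model_config,
        serving_models,
        request_logger=None,
    )
    return openai_serving_chat, openai_serving_completion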