@@ -109,13 +109,20 @@ def getAsyncVLLMClient(port) -> AsyncOpenAI:
 
 
 class _VLLMModelServer():
-  def __init__(self, model_name: str, vllm_server_kwargs: dict[str, str]):
+  def __init__(
+      self,
+      model_name: str,
+      vllm_server_kwargs: dict[str, str],
+      vllm_executable: Optional[str] = None):
     self._model_name = model_name
     self._vllm_server_kwargs = vllm_server_kwargs
     self._server_started = False
     self._server_process = None
     self._server_port: int = -1
     self._server_process_lock = threading.RLock()
+    self._vllm_executable = 'vllm.entrypoints.openai.api_server'
+    if vllm_executable is not None:
+      self._vllm_executable = vllm_executable
 
     self.start_server()
 
@@ -125,7 +132,7 @@ def start_server(self, retries=3):
     server_cmd = [
         sys.executable,
         '-m',
-        'vllm.entrypoints.openai.api_server',
+        self._vllm_executable,
         '--model',
         self._model_name,
         '--port',
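For reference, here is a minimal sketch (not part of the diff) of how the launch command now resolves. `build_server_cmd` is a hypothetical helper that mirrors the fallback logic in `__init__` and the command construction in `start_server`:

```python
import sys
from typing import Optional


# Hypothetical helper mirroring the PR's logic: the server module defaults to
# vLLM's OpenAI-compatible entrypoint unless an alternative module
# (e.g. Dynamo's 'dynamo.vllm') is supplied.
def build_server_cmd(
    model_name: str,
    port: int,
    vllm_executable: Optional[str] = None) -> list[str]:
  module = vllm_executable or 'vllm.entrypoints.openai.api_server'
  return [
      sys.executable, '-m', module, '--model', model_name, '--port', str(port)
  ]


# Default engine:
#   [python, '-m', 'vllm.entrypoints.openai.api_server', '--model', ...]
print(build_server_cmd('facebook/opt-125m', 8000))
# With use_dynamo=True the handlers pass 'dynamo.vllm' instead:
#   [python, '-m', 'dynamo.vllm', '--model', ...]
print(build_server_cmd('facebook/opt-125m', 8000, vllm_executable='dynamo.vllm'))
```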
@@ -175,7 +182,8 @@ class VLLMCompletionsModelHandler(ModelHandler[str,
   def __init__(
       self,
       model_name: str,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
     """Implementation of the ModelHandler interface for vLLM using text as
     input.
 
@@ -194,13 +202,22 @@ def __init__(
         `{'echo': 'true'}` to prepend new messages with the previous message.
         For a list of possible kwargs, see
         https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
+      use_dynamo: Whether to use Nvidia Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`).
     """
     self._model_name = model_name
     self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
     self._env_vars = {}
+    self._vllm_executable = None
+    if use_dynamo:
+      self._vllm_executable = 'dynamo.vllm'
 
   def load_model(self) -> _VLLMModelServer:
-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)
 
   async def _async_run_inference(
       self,
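From the caller's side, the new flag is the only visible change. A minimal pipeline sketch, assuming the handler lives at its usual `apache_beam.ml.inference.vllm_inference` path and that `ai-dynamo[vllm]` is installed on the workers; the model name and prompt are placeholders:

```python
import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.vllm_inference import VLLMCompletionsModelHandler

# use_dynamo=True makes load_model() start `python -m dynamo.vllm ...`
# instead of vLLM's bundled OpenAI-compatible server.
handler = VLLMCompletionsModelHandler(
    model_name='facebook/opt-125m',  # placeholder model
    use_dynamo=True)

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['Hello, my name is'])  # placeholder prompt
      | RunInference(handler)
      | beam.Map(print))
```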
@@ -253,7 +270,8 @@ def __init__(
       self,
       model_name: str,
       chat_template_path: Optional[str] = None,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
     """Implementation of the ModelHandler interface for vLLM using previous
     messages as input.
 
@@ -277,12 +295,18 @@ def __init__(
         `{'echo': 'true'}` to prepend new messages with the previous message.
         For a list of possible kwargs, see
         https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api
+      use_dynamo: Whether to use Nvidia Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`).
     """
     self._model_name = model_name
     self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
     self._env_vars = {}
     self._chat_template_path = chat_template_path
     self._chat_file = f'template-{uuid.uuid4().hex}.jinja'
+    self._vllm_executable = None
+    if use_dynamo:
+      self._vllm_executable = 'dynamo.vllm'
 
   def load_model(self) -> _VLLMModelServer:
     chat_template_contents = ''
@@ -295,7 +318,10 @@ def load_model(self) -> _VLLMModelServer:
       f.write(chat_template_contents)
       self._vllm_server_kwargs['chat_template'] = local_chat_template_path
 
-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)
 
   async def _async_run_inference(
       self,
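The chat handler is exercised the same way. A sketch assuming the module's `OpenAIChatMessage` dataclass (role/content) for inputs, again with a placeholder model and prompt:

```python
import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.vllm_inference import (
    OpenAIChatMessage, VLLMChatModelHandler)

chat_handler = VLLMChatModelHandler(
    model_name='facebook/opt-125m',  # placeholder model
    use_dynamo=True)  # launches `python -m dynamo.vllm ...` under the hood

with beam.Pipeline() as p:
  _ = (
      p
      # Each element is one conversation: a list of chat messages.
      | beam.Create(
          [[OpenAIChatMessage(role='user', content='What is Apache Beam?')]])
      | RunInference(chat_handler)
      | beam.Map(print))
```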