
Commit a5251b8

[WIP] Add vllm Dynamo support
1 parent 2110932


2 files changed: +33 -6 lines changed

sdks/python/apache_beam/ml/inference/vllm_inference.py

Lines changed: 32 additions & 6 deletions
@@ -109,13 +109,20 @@ def getAsyncVLLMClient(port) -> AsyncOpenAI:
 
 
 class _VLLMModelServer():
-  def __init__(self, model_name: str, vllm_server_kwargs: dict[str, str]):
+  def __init__(
+      self,
+      model_name: str,
+      vllm_server_kwargs: dict[str, str],
+      vllm_executable: Optional[str] = None):
     self._model_name = model_name
     self._vllm_server_kwargs = vllm_server_kwargs
     self._server_started = False
     self._server_process = None
     self._server_port: int = -1
     self._server_process_lock = threading.RLock()
+    self._vllm_executable = 'vllm.entrypoints.openai.api_server'
+    if vllm_executable is not None:
+      self._vllm_executable = vllm_executable
 
     self.start_server()
 
@@ -125,7 +132,7 @@ def start_server(self, retries=3):
     server_cmd = [
         sys.executable,
         '-m',
-        'vllm.entrypoints.openai.api_server',
+        self._vllm_executable,
         '--model',
         self._model_name,
         '--port',
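
Note: when a custom executable is supplied (as the dynamo handlers below do), the command assembled in start_server resolves to roughly the snippet that follows. This is an illustrative sketch only, not code from this commit; the model name and port are placeholder values.

import sys

# Roughly what _VLLMModelServer builds when self._vllm_executable is
# 'dynamo.vllm'; 'facebook/opt-125m' and '8000' are placeholders.
server_cmd = [
    sys.executable,
    '-m',
    'dynamo.vllm',
    '--model',
    'facebook/opt-125m',
    '--port',
    '8000',
]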
@@ -175,7 +182,8 @@ class VLLMCompletionsModelHandler(ModelHandler[str,
   def __init__(
       self,
       model_name: str,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
     """Implementation of the ModelHandler interface for vLLM using text as
     input.
 
@@ -194,13 +202,22 @@ def __init__(
         `{'echo': 'true'}` to prepend new messages with the previous message.
         For a list of possible kwargs, see
         https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
+      use_dynamo: Whether to use Nvidia Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`)
     """
     self._model_name = model_name
     self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
     self._env_vars = {}
+    self._vllm_executable = None
+    if use_dynamo:
+      self._vllm_executable = 'dynamo.vllm'
 
   def load_model(self) -> _VLLMModelServer:
-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)
 
   async def _async_run_inference(
       self,
@@ -253,7 +270,8 @@ def __init__(
       self,
       model_name: str,
       chat_template_path: Optional[str] = None,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
     """ Implementation of the ModelHandler interface for vLLM using previous
     messages as input.
 
@@ -277,12 +295,17 @@ def __init__(
         `{'echo': 'true'}` to prepend new messages with the previous message.
         For a list of possible kwargs, see
         https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api
+      use_dynamo: Whether to use Nvidia Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`)
     """
     self._model_name = model_name
     self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
     self._env_vars = {}
     self._chat_template_path = chat_template_path
     self._chat_file = f'template-{uuid.uuid4().hex}.jinja'
+    if use_dynamo:
+      self._vllm_executable = 'dynamo.vllm'
 
   def load_model(self) -> _VLLMModelServer:
     chat_template_contents = ''
@@ -295,7 +318,10 @@ def load_model(self) -> _VLLMModelServer:
         f.write(chat_template_contents)
       self._vllm_server_kwargs['chat_template'] = local_chat_template_path
 
-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)
 
   async def _async_run_inference(
       self,
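
For context, a minimal usage sketch of the new flag (not part of this commit): it assumes the handler changes above are applied and that `ai-dynamo[vllm]` is installed in the runtime environment. The model name and prompt are placeholders.

import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.vllm_inference import VLLMCompletionsModelHandler

# Placeholder model; any vLLM-compatible model name could be used here.
handler = VLLMCompletionsModelHandler(
    model_name='facebook/opt-125m',
    use_dynamo=True)  # launch 'dynamo.vllm' instead of the default vLLM server

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['What is Apache Beam?'])  # placeholder prompt
      | RunInference(handler)
      | beam.Map(print))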

sdks/python/apache_beam/ml/inference/vllm_tests_requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@ pillow>=8.0.0
 transformers>=4.18.0
 google-cloud-monitoring>=2.27.0
 openai>=1.52.2
+ai-dynamo[vllm]>=0.1.1
