Skip to content

Commit 4019df3

Browse files
committed
Fix vLLM worker
1 parent 4769488 commit 4019df3

File tree

3 files changed

+56
-3
lines changed

3 files changed

+56
-3
lines changed

sdks/python/apache_beam/examples/inference/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,12 @@ python -m apache_beam.examples.inference.vllm_text_completion \
930930
931931
Make sure to enable the 5xx driver since vLLM only works with 5xx drivers, not 4xx.
932932
933+
On GPUs with about 16GiB of memory (for example NVIDIA T4), vLLM's default settings can cause
934+
engine startup to fail with a CUDA out-of-memory error. The example therefore passes conservative
935+
`--max-num-seqs` and `--gpu-memory-utilization` values by default (overridable with
936+
`--vllm_max_num_seqs` and `--vllm_gpu_memory_utilization`) via
937+
`vllm_server_kwargs`, matching the pattern used in other vLLM examples.
938+
933939
This writes the output to the output file location with contents like:
934940
935941
```

sdks/python/apache_beam/examples/inference/vllm_text_completion.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import argparse
2727
import logging
2828
from collections.abc import Iterable
29+
from typing import Optional
2930

3031
import apache_beam as beam
3132
from apache_beam.ml.inference.base import PredictionResult
@@ -37,6 +38,12 @@
3738
from apache_beam.options.pipeline_options import SetupOptions
3839
from apache_beam.runners.runner import PipelineResult
3940

41+
# Defaults avoid CUDA OOM on ~16GB GPUs (e.g. NVIDIA T4) with vLLM V1: the engine
42+
# warms the sampler with many dummy sequences unless max_num_seqs is reduced, and
43+
# the default gpu_memory_utilization can leave no free VRAM for that step.
44+
_DEFAULT_VLLM_MAX_NUM_SEQS = 32
45+
_DEFAULT_VLLM_GPU_MEMORY_UTILIZATION = 0.72
46+
4047
COMPLETION_EXAMPLES = [
4148
"Hello, my name is",
4249
"The president of the United States is",
@@ -112,33 +119,71 @@ def parse_known_args(argv):
112119
required=False,
113120
default=None,
114121
help='Chat template to use for chat example.')
122+
parser.add_argument(
123+
'--vllm_max_num_seqs',
124+
dest='vllm_max_num_seqs',
125+
type=int,
126+
default=_DEFAULT_VLLM_MAX_NUM_SEQS,
127+
help=(
128+
'Passed to the vLLM OpenAI server as --max-num-seqs. '
129+
'Lower values use less GPU memory during startup and inference; '
130+
'required for many ~16GB GPUs (see --vllm_gpu_memory_utilization).'))
131+
parser.add_argument(
132+
'--vllm_gpu_memory_utilization',
133+
dest='vllm_gpu_memory_utilization',
134+
type=float,
135+
default=_DEFAULT_VLLM_GPU_MEMORY_UTILIZATION,
136+
help=(
137+
'Passed to the vLLM OpenAI server as --gpu-memory-utilization '
138+
'(fraction of total GPU memory for KV cache). Lower this if the '
139+
'engine fails to start with CUDA out of memory.'))
115140
return parser.parse_known_args(argv)
116141

117142

143+
def build_vllm_server_kwargs(known_args) -> dict[str, str]:
144+
"""Returns CLI flags for ``VLLMCompletionsModelHandler(..., vllm_server_kwargs=...)``."""
145+
return {
146+
'max-num-seqs': str(known_args.vllm_max_num_seqs),
147+
'gpu-memory-utilization': str(known_args.vllm_gpu_memory_utilization),
148+
}
149+
150+
118151
class PostProcessor(beam.DoFn):
119152
def process(self, element: PredictionResult) -> Iterable[str]:
120153
yield str(element.example) + ": " + str(element.inference)
121154

122155

123156
def run(
124-
argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult:
157+
argv=None,
158+
save_main_session=True,
159+
test_pipeline=None,
160+
vllm_server_kwargs: Optional[dict[str, str]] = None) -> PipelineResult:
125161
"""
126162
Args:
127163
argv: Command line arguments defined for this example.
128164
save_main_session: Used for internal testing.
129165
test_pipeline: Used for internal testing.
166+
vllm_server_kwargs: Optional override for vLLM server options. When None,
167+
options are taken from argv (``--vllm_max_num_seqs``,
168+
``--vllm_gpu_memory_utilization``). Used by tests.
130169
"""
131170
known_args, pipeline_args = parse_known_args(argv)
132171
pipeline_options = PipelineOptions(pipeline_args)
133172
pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
134173

135-
model_handler = VLLMCompletionsModelHandler(model_name=known_args.model)
174+
effective_vllm_kwargs = (
175+
vllm_server_kwargs if vllm_server_kwargs is not None else
176+
build_vllm_server_kwargs(known_args))
177+
178+
model_handler = VLLMCompletionsModelHandler(
179+
model_name=known_args.model, vllm_server_kwargs=effective_vllm_kwargs)
136180
input_examples = COMPLETION_EXAMPLES
137181

138182
if known_args.chat:
139183
model_handler = VLLMChatModelHandler(
140184
model_name=known_args.model,
141-
chat_template_path=known_args.chat_template)
185+
chat_template_path=known_args.chat_template,
186+
vllm_server_kwargs=dict(effective_vllm_kwargs))
142187
input_examples = CHAT_EXAMPLES
143188

144189
pipeline = test_pipeline

sdks/python/apache_beam/ml/inference/vllm_inference.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ def __init__(
199199
`python -m vllm.entrypoints.openai.api_server <beam provided args>
200200
<vllm_server_kwargs>`. For example, you could pass
201201
`{'echo': 'true'}` to prepend new messages with the previous message.
202+
On ~16GB GPUs, pass lower ``max-num-seqs`` and ``gpu-memory-utilization``
203+
values (see ``apache_beam.examples.inference.vllm_text_completion``).
202204
For a list of possible kwargs, see
203205
https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
204206
min_batch_size: optional. the minimum batch size to use when batching

0 commit comments

Comments
 (0)