Skip to content

Commit f950679

Browse files
feat: add the VllmCudaGraphMode (#3125)
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
1 parent f73df4f commit f950679

File tree

2 files changed

+61
-0
lines changed

2 files changed

+61
-0
lines changed

docling/datamodel/vlm_engine_options.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"""
66

77
import logging
8+
from enum import Enum
89
from typing import Any, Dict, Literal, Optional
910

1011
from pydantic import AnyUrl, Field
@@ -105,6 +106,48 @@ class MlxVlmEngineOptions(BaseVlmEngineOptions):
105106
# =============================================================================
106107

107108

109+
class VllmCudaGraphMode(str, Enum):
    """How (and whether) the vLLM v1 engine captures CUDA graphs.

    CUDA graphs cut per-kernel launch overhead by recording a sequence of
    CUDA operations once and replaying it, instead of launching each kernel
    individually on every step.

    Members (values mirror vLLM's own ``CUDAGraphMode`` names):

    * ``NONE`` -- no CUDA graphs at all; pure eager execution. Fastest
      startup, lowest steady-state throughput. Suited to short-lived
      processes, notebooks, and debugging.
    * ``FULL`` -- one monolithic graph covering the whole forward pass.
      Best coverage, but needs very static execution shapes and may fail
      on some models or dynamic workloads.
    * ``PIECEWISE`` -- several smaller graphs over model segments (e.g.
      transformer blocks) with selected ops left out. Tolerates dynamic
      shapes better than ``FULL`` while still accelerating most of the
      forward pass.
    * ``FULL_AND_PIECEWISE`` -- hybrid (the default in many vLLM
      versions): ``FULL`` graphs for decode-only batches, ``PIECEWISE``
      for prefill and mixed prefill+decode batches. Usually the best
      throughput for typical LLM serving.
    * ``FULL_DECODE_ONLY`` -- ``FULL`` graphs for decode batches only;
      prefill and mixed batches stay eager. Much cheaper capture time and
      memory than ``FULL_AND_PIECEWISE``, yet token generation is still
      accelerated.
    """

    NONE = "NONE"
    FULL = "FULL"
    PIECEWISE = "PIECEWISE"
    FULL_AND_PIECEWISE = "FULL_AND_PIECEWISE"
    FULL_DECODE_ONLY = "FULL_DECODE_ONLY"
149+
150+
108151
class VllmVlmEngineOptions(BaseVlmEngineOptions):
109152
"""Options for vLLM inference engine (high-throughput serving)."""
110153

@@ -126,6 +169,14 @@ class VllmVlmEngineOptions(BaseVlmEngineOptions):
126169
default=False, description="Allow execution of custom code from model repo"
127170
)
128171

172+
    # Forwarded by the vLLM engine into CompilationConfig.cudagraph_mode
    # (vLLM v1 engine only; ignored elsewhere).
    # NOTE(review): the enum documentation describes FULL_AND_PIECEWISE as the
    # usual vLLM default, while this option defaults to PIECEWISE — confirm the
    # more conservative default is intentional.
    cudagraph_mode: VllmCudaGraphMode = Field(
        default=VllmCudaGraphMode.PIECEWISE,
        description=(
            "CUDA graph capture mode (vLLM v1 engine only). "
            "See VllmCudaGraphMode for the available options and their trade-offs."
        ),
    )
179+
129180

130181
# =============================================================================
131182
# API ENGINE OPTIONS

docling/models/inference_engines/vlm/vllm_engine.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,16 @@ def download_wrapper(repo_id: str, revision: str) -> Path:
212212
"gpu_memory_utilization", self.options.gpu_memory_utilization
213213
)
214214

215+
# Apply CUDA graph capture mode
216+
from vllm.config.compilation import (
217+
CompilationConfig,
218+
CUDAGraphMode,
219+
)
220+
221+
llm_kwargs["compilation_config"] = CompilationConfig(
222+
cudagraph_mode=CUDAGraphMode[self.options.cudagraph_mode.value]
223+
)
224+
215225
# Quantization support (if specified in extra_config)
216226
if "quantization" in extra_cfg:
217227
llm_kwargs.setdefault("quantization", extra_cfg["quantization"])

0 commit comments

Comments
 (0)