55"""
66
77import logging
8+ from enum import Enum
89from typing import Any , Dict , Literal , Optional
910
1011from pydantic import AnyUrl , Field
@@ -105,6 +106,48 @@ class MlxVlmEngineOptions(BaseVlmEngineOptions):
105106# =============================================================================
106107
107108
class VllmCudaGraphMode(str, Enum):
    """CUDA graph capture strategy for the vLLM v1 engine.

    vLLM can record sequences of CUDA kernels into CUDA graphs and replay
    them, cutting per-kernel launch overhead during inference. Each member
    selects how much of the forward pass gets captured:

    * ``NONE`` -- no capture at all; every batch runs eagerly. Startup is
      fastest and nothing can fail during capture, at the cost of
      steady-state throughput. Suited to debugging, notebooks, and
      short-lived processes.
    * ``FULL`` -- the whole forward pass is recorded as a single graph.
      Gives maximum coverage, but demands very static execution shapes and
      can break on some models or dynamic workloads.
    * ``PIECEWISE`` -- the model is captured as several smaller graphs
      (e.g. per transformer block) separated by selected ops. More
      tolerant of dynamic shapes than ``FULL`` while still covering most
      of the forward pass.
    * ``FULL_AND_PIECEWISE`` -- hybrid scheme (the default in many vLLM
      releases): decode-only batches use FULL graphs, while prefill and
      mixed prefill+decode batches fall back to PIECEWISE graphs. Usually
      the best throughput choice for typical LLM serving.
    * ``FULL_DECODE_ONLY`` -- FULL graphs are built only for decode
      batches; prefill and mixed batches stay eager. Sharply lowers
      capture time and memory versus ``FULL_AND_PIECEWISE`` while still
      speeding up token generation.
    """

    # Member values intentionally mirror the member names so they round-trip
    # cleanly through config files and vLLM's own string-valued setting.
    NONE = "NONE"
    FULL = "FULL"
    PIECEWISE = "PIECEWISE"
    FULL_AND_PIECEWISE = "FULL_AND_PIECEWISE"
    FULL_DECODE_ONLY = "FULL_DECODE_ONLY"
149+
150+
108151class VllmVlmEngineOptions (BaseVlmEngineOptions ):
109152 """Options for vLLM inference engine (high-throughput serving)."""
110153
@@ -126,6 +169,14 @@ class VllmVlmEngineOptions(BaseVlmEngineOptions):
126169 default = False , description = "Allow execution of custom code from model repo"
127170 )
128171
172+ cudagraph_mode : VllmCudaGraphMode = Field (
173+ default = VllmCudaGraphMode .PIECEWISE ,
174+ description = (
175+ "CUDA graph capture mode (vLLM v1 engine only). "
176+ "See VllmCudaGraphMode for the available options and their trade-offs."
177+ ),
178+ )
179+
129180
130181# =============================================================================
131182# API ENGINE OPTIONS
0 commit comments