Commit f70e398

Enable FCG by default for hybrid models in V1
Signed-off-by: Thomas Parnell <[email protected]>
1 parent 00976db · commit f70e398
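
The change: on the V1 engine, hybrid (attention + mamba) models now get full CUDA graph (FCG) capture by default rather than requiring users to opt in through the compilation config. As a hedged sketch of what the equivalent manual opt-in would look like (the LLM entry point's compilation_config keyword and the model name are assumptions for illustration, not shown in this diff):

```python
# Hedged sketch: opting into full CUDA graphs explicitly, which this commit
# makes the default for hybrid models. Assumes the public LLM entry point
# accepts a compilation_config dict containing the full_cuda_graph key shown
# in the diff; the model name is illustrative only.
from vllm import LLM

llm = LLM(
    model="ibm-ai-platform/Bamba-9B",  # example hybrid attention/mamba model
    compilation_config={"full_cuda_graph": True},
)
```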

File tree

1 file changed: +6, -0 lines changed


vllm/model_executor/models/config.py

Lines changed: 6 additions & 0 deletions
@@ -296,6 +296,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
+        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -361,6 +362,11 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
+        # enable full cuda graphs for decode-only batches
+        # note (tdoublep): this is currently necessary to
+        # match V0 performance
+        compilation_config.full_cuda_graph = True
+
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
