Commit f70e398

Enable FCG by default for hybrid models in V1
Signed-off-by: Thomas Parnell <[email protected]>
1 parent 00976db · commit f70e398
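
The change: on the V1 engine, hybrid (attention + mamba) models now get full CUDA graph (FCG) capture by default rather than requiring users to opt in through the compilation config. As a hedged sketch of what the equivalent manual opt-in would look like (the LLM entry point's compilation_config keyword and the model name are assumptions for illustration, not shown in this diff):

```python
# Hedged sketch: opting into full CUDA graphs explicitly, which this commit
# makes the default for hybrid models. Assumes the public LLM entry point
# accepts a compilation_config dict containing the full_cuda_graph key shown
# in the diff; the model name is illustrative only.
from vllm import LLM

llm = LLM(
    model="ibm-ai-platform/Bamba-9B",  # example hybrid attention/mamba model
    compilation_config={"full_cuda_graph": True},
)
```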

File tree

1 file changed: +6, -0 lines changed


vllm/model_executor/models/config.py

Lines changed: 6 additions & 0 deletions
@@ -296,6 +296,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
+        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -361,6 +362,11 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
+        # enable full cuda graphs for decode-only batches
+        # note (tdoublep): this is currently necessary to
+        # match V0 performance
+        compilation_config.full_cuda_graph = True
+
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
