Commit bad4c58

harden things a bit
Signed-off-by: Thomas Parnell <[email protected]>
1 parent f70e398 commit bad4c58

11 files changed (+88, -27 lines)
vllm/config/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -1678,6 +1678,10 @@ def is_attention_free(self) -> bool:
     def is_hybrid(self) -> bool:
         return self._model_info.is_hybrid
 
+    @property
+    def has_mamba2(self) -> bool:
+        return self._model_info.has_mamba2
+
     @property
     def has_noops(self) -> bool:
         return self._model_info.has_noops
@@ -4215,14 +4219,18 @@ def try_verify_and_update_config(self):
             return
 
         from vllm.model_executor.models.config import (
-            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
+            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig,
+            Mamba2ModelConfig)
         cls = MODELS_CONFIG_MAP.get(architecture, None)
         if cls is not None:
             cls.verify_and_update_config(self)
 
         if self.model_config.is_hybrid:
             HybridAttentionMambaModelConfig.verify_and_update_config(self)
 
+        if self.model_config.has_mamba2:
+            Mamba2ModelConfig.verify_and_update_config(self)
+
         if self.model_config.convert_type == "classify":
             # Maybe convert ForCausalLM into ForSequenceClassification model.
             from vllm.model_executor.models.adapters import (

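For orientation, the two hunks above wire the new flag into config resolution: `ModelConfig.has_mamba2` simply forwards the registry's `_ModelInfo` flag, and `try_verify_and_update_config` now runs `Mamba2ModelConfig` after the hybrid hook whenever the flag is set. A minimal sketch of that dispatch order, using made-up stand-ins rather than the real vLLM classes:

```python
# Toy sketch of the dispatch order added above; _ToyModelConfig and
# apply_config_hooks are hypothetical stand-ins, not vLLM APIs.
class _ToyModelConfig:
    def __init__(self, is_hybrid: bool, has_mamba2: bool) -> None:
        self.is_hybrid = is_hybrid
        self.has_mamba2 = has_mamba2


def apply_config_hooks(model_config: _ToyModelConfig) -> list[str]:
    applied: list[str] = []
    if model_config.is_hybrid:
        applied.append("HybridAttentionMambaModelConfig")
    if model_config.has_mamba2:
        applied.append("Mamba2ModelConfig")
    return applied


# A hybrid model with mamba2 blocks passes through both hooks, in this
# order; a pure mamba2 model only hits the second one.
assert apply_config_hooks(_ToyModelConfig(True, True)) == [
    "HybridAttentionMambaModelConfig", "Mamba2ModelConfig"
]
assert apply_config_hooks(_ToyModelConfig(False, True)) == ["Mamba2ModelConfig"]
```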
vllm/config/compilation.py

Lines changed: 3 additions & 2 deletions
@@ -214,7 +214,7 @@ class CompilationConfig:
     are always used, it can set this to False. Otherwise, it should
     set this to True, and the compiler will copy the input to an
     internally managed buffer. Default is False."""
-    full_cuda_graph: bool = False
+    full_cuda_graph: Optional[bool] = None
     """whether to use a full cuda graph for the entire forward pass rather than
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
@@ -344,7 +344,8 @@ def __post_init__(self) -> None:
     def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
-
+        if self.full_cuda_graph is None:
+            self.full_cuda_graph = False
         from torch._dynamo.backends.registry import list_backends
         torch_backends = list_backends(exclude_tags=tuple())
         if self.level in [

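Changing `full_cuda_graph` from `bool = False` to `Optional[bool] = None` makes the flag tri-state: `None` means the user expressed no preference (so a model-specific hook may opt in later), an explicit value is left alone, and `init_backend` resolves a still-unset flag to `False`. A small sketch of those assumed semantics, using a toy dataclass rather than the real `CompilationConfig`:

```python
# Sketch of the tri-state default, based on the diff above; ToyCompilationConfig
# and mamba2_hook are hypothetical stand-ins for illustration only.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyCompilationConfig:
    full_cuda_graph: Optional[bool] = None

    def resolve(self) -> None:
        # Mirrors init_backend(): anything still unset falls back to False.
        if self.full_cuda_graph is None:
            self.full_cuda_graph = False


def mamba2_hook(cfg: ToyCompilationConfig) -> None:
    # Mirrors Mamba2ModelConfig.verify_and_update_config(): only opt in
    # when the user has not expressed a preference.
    if cfg.full_cuda_graph is None:
        cfg.full_cuda_graph = True


user_off = ToyCompilationConfig(full_cuda_graph=False)
mamba2_hook(user_off)
user_off.resolve()
assert user_off.full_cuda_graph is False  # explicit user choice is preserved

unset = ToyCompilationConfig()
mamba2_hook(unset)
unset.resolve()
assert unset.full_cuda_graph is True  # mamba2 models default to full CUDA graphs
```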
vllm/model_executor/models/bamba.py

Lines changed: 3 additions & 3 deletions
@@ -38,8 +38,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -420,7 +420,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid, SupportsQuant):
+                       IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/config.py

Lines changed: 19 additions & 6 deletions
@@ -275,6 +275,25 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
             "%d for performance.", 1024)
 
 
+class Mamba2ModelConfig(VerifyAndUpdateConfig):
+
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Enable full cuda graphs for decode-only batches to ensure that
+        V1 performance matches that of V0.
+
+        Args:
+            vllm_config: vLLM Config
+        """
+        if not envs.VLLM_USE_V1:
+            return
+
+        compilation_config = vllm_config.compilation_config
+        if compilation_config.full_cuda_graph is None:
+            compilation_config.full_cuda_graph = True
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod
@@ -296,7 +315,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
-        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -362,11 +380,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
-        # enable full cuda graphs for decode-only batches
-        # note (tdoublep): this is currently necessary to
-        # match V0 performance
-        compilation_config.full_cuda_graph = True
-
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,

vllm/model_executor/models/falcon_h1.py

Lines changed: 3 additions & 2 deletions
@@ -36,7 +36,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP)
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -507,7 +508,7 @@ def forward(
 
 
 class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                          IsHybrid):
+                          IsHybrid, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],

vllm/model_executor/models/granitemoehybrid.py

Lines changed: 4 additions & 3 deletions
@@ -38,8 +38,8 @@
 
 from .granitemoe import GraniteMoeMoE
 from .granitemoeshared import GraniteMoeSharedMLP
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -513,7 +513,8 @@ def _load_expert(n, p, name, shard_id, expert_id):
 
 
 class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
-                                  SupportsPP, IsHybrid, SupportsQuant):
+                                  SupportsPP, IsHybrid, SupportsQuant,
+                                  HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/interfaces.py

Lines changed: 27 additions & 0 deletions
@@ -468,6 +468,33 @@ def is_attention_free(
     return getattr(model, "is_attention_free", False)
 
 
+@runtime_checkable
+class HasMamba2(Protocol):
+    """The interface required for all models like mamba2, bamba, zamba2,
+    etc., that have mamba2 blocks"""
+
+    has_mamba2: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates if the model has mamba2 blocks.
+    """
+
+
+@overload
+def has_mamba2(model: object) -> TypeIs[HasMamba2]:
+    ...
+
+
+@overload
+def has_mamba2(model: type[object]) -> TypeIs[type[HasMamba2]]:
+    ...
+
+
+def has_mamba2(
+    model: Union[type[object], object]
+) -> Union[TypeIs[type[HasMamba2]], TypeIs[HasMamba2]]:
+    return getattr(model, "has_mamba2", False)
+
+
 @runtime_checkable
 class IsHybrid(Protocol):
     """The interface required for all models like Jamba that have both

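The `HasMamba2` marker follows the same pattern as the other model interfaces in this module: a `runtime_checkable` `Protocol` carrying a `ClassVar[Literal[True]]` flag, plus a `getattr`-based helper so the check works on both classes and instances without any inheritance. A simplified, self-contained sketch of how the marker is consumed (`ToyMamba2Model` and `ToyTransformer` are made-up names, and the real helper returns `TypeIs` rather than a plain `bool`):

```python
# Simplified sketch of the HasMamba2 marker pattern; the Toy* classes are
# hypothetical and exist only to show how the check behaves.
from typing import ClassVar, Literal, Protocol, Union, runtime_checkable


@runtime_checkable
class HasMamba2(Protocol):
    """Simplified copy of the marker protocol above."""

    has_mamba2: ClassVar[Literal[True]] = True


class ToyMamba2Model:          # a model class that opts in via the flag
    has_mamba2: ClassVar[Literal[True]] = True


class ToyTransformer:          # a model class without the marker
    pass


def has_mamba2(model: Union[type, object]) -> bool:
    # Same getattr-based check as the helper in the diff: works on both
    # classes and instances, no inheritance from the protocol required.
    return bool(getattr(model, "has_mamba2", False))


assert has_mamba2(ToyMamba2Model) and has_mamba2(ToyMamba2Model())
assert not has_mamba2(ToyTransformer)
assert isinstance(ToyMamba2Model(), HasMamba2)  # runtime_checkable structural check
```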
vllm/model_executor/models/mamba2.py

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState,
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
                                                    IsAttentionFree)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
@@ -198,7 +198,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
+class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, HasMamba2):
 
     @classmethod
     def get_mamba_state_shape_from_config(

vllm/model_executor/models/nemotron_h.py

Lines changed: 4 additions & 4 deletions
@@ -45,9 +45,9 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   SupportsLoRA, SupportsPP,
-                                                   SupportsQuant)
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
+                                                   IsHybrid, SupportsLoRA,
+                                                   SupportsPP, SupportsQuant)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.models.utils import (
@@ -446,7 +446,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                           IsHybrid, SupportsQuant):
+                           IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",

vllm/model_executor/models/registry.py

Lines changed: 12 additions & 2 deletions
@@ -25,8 +25,8 @@
 from vllm.transformers_utils.dynamic_module import (
     try_get_class_from_dynamic_module)
 
-from .interfaces import (has_inner_state, has_noops, is_attention_free,
-                         is_hybrid, supports_cross_encoding,
+from .interfaces import (has_inner_state, has_mamba2, has_noops,
+                         is_attention_free, is_hybrid, supports_cross_encoding,
                          supports_multimodal, supports_multimodal_raw_input,
                          supports_pp, supports_transcription, supports_v0_only)
 from .interfaces_base import is_pooling_model, is_text_generation_model
@@ -312,6 +312,7 @@ class _ModelInfo:
     has_inner_state: bool
     is_attention_free: bool
     is_hybrid: bool
+    has_mamba2: bool
     has_noops: bool
     supports_transcription: bool
     supports_transcription_only: bool
@@ -329,6 +330,7 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo":
            supports_pp=supports_pp(model),
            has_inner_state=has_inner_state(model),
            is_attention_free=is_attention_free(model),
+           has_mamba2=has_mamba2(model),
            is_hybrid=is_hybrid(model),
            supports_transcription=supports_transcription(model),
            supports_transcription_only=(supports_transcription(model) and
@@ -760,6 +762,14 @@ def is_hybrid_model(
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
         return model_cls.is_hybrid
 
+    def model_has_mamba2(
+        self,
+        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
+    ) -> bool:
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
+        return model_cls.has_mamba2
+
     def is_noops_model(
         self,
         architectures: Union[str, list[str]],

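On the registry side, `_ModelInfo.from_model_cls` snapshots the flag once per model class via the `has_mamba2()` helper, and `ModelRegistry.model_has_mamba2()` exposes it by architecture, which is what `ModelConfig.has_mamba2` ultimately reads. A toy end-to-end sketch of that flow (simplified stand-ins, not the real registry code):

```python
# Toy sketch of how the registry propagates the flag; ToyModelInfo, ToyBamba,
# ToyLlama, and _toy_registry are hypothetical stand-ins for illustration.
from dataclasses import dataclass


@dataclass(frozen=True)
class ToyModelInfo:
    has_mamba2: bool

    @staticmethod
    def from_model_cls(model_cls: type) -> "ToyModelInfo":
        # Same getattr-based probe used by the interfaces helper.
        return ToyModelInfo(has_mamba2=getattr(model_cls, "has_mamba2", False))


class ToyBamba:        # stands in for a model class tagged with HasMamba2
    has_mamba2 = True


class ToyLlama:        # a plain attention-only model, no marker
    pass


_toy_registry = {"ToyBamba": ToyBamba, "ToyLlama": ToyLlama}


def model_has_mamba2(architecture: str) -> bool:
    # Mirrors ModelRegistry.model_has_mamba2(): look up the class, read the flag.
    return ToyModelInfo.from_model_cls(_toy_registry[architecture]).has_mamba2


assert model_has_mamba2("ToyBamba")
assert not model_has_mamba2("ToyLlama")
```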