From f70e3984219d1cb273923f00e2bac7e27babd6b1 Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Sun, 10 Aug 2025 09:23:54 -0400
Subject: [PATCH 1/3] Enable FCG by default for hybrid models in V1

Signed-off-by: Thomas Parnell
---
 vllm/model_executor/models/config.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 6f21cd267b0e..601433953040 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -296,6 +296,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
+        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -361,6 +362,11 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
+        # enable full cuda graphs for decode-only batches
+        # note (tdoublep): this is currently necessary to
+        # match V0 performance
+        compilation_config.full_cuda_graph = True
+
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,

From bad4c589cb0d9d4028a762f5938d4269dde7ab2f Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Sun, 10 Aug 2025 10:20:35 -0400
Subject: [PATCH 2/3] harden things a bit

Signed-off-by: Thomas Parnell
---
 vllm/config/__init__.py                        | 10 ++++++-
 vllm/config/compilation.py                     |  5 ++--
 vllm/model_executor/models/bamba.py            |  6 ++---
 vllm/model_executor/models/config.py           | 25 ++++++++++++-----
 vllm/model_executor/models/falcon_h1.py        |  5 ++--
 .../model_executor/models/granitemoehybrid.py  |  7 ++---
 vllm/model_executor/models/interfaces.py       | 27 +++++++++++++++++++
 vllm/model_executor/models/mamba2.py           |  4 +--
 vllm/model_executor/models/nemotron_h.py       |  8 +++---
 vllm/model_executor/models/registry.py         | 14 ++++++++--
 vllm/model_executor/models/zamba2.py           |  4 +--
 11 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 49da3fd848ec..a7240491ac66 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1678,6 +1678,10 @@ def is_attention_free(self) -> bool:
     def is_hybrid(self) -> bool:
         return self._model_info.is_hybrid
 
+    @property
+    def has_mamba2(self) -> bool:
+        return self._model_info.has_mamba2
+
     @property
     def has_noops(self) -> bool:
         return self._model_info.has_noops
@@ -4215,7 +4219,8 @@ def try_verify_and_update_config(self):
             return
 
         from vllm.model_executor.models.config import (
-            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
+            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig,
+            Mamba2ModelConfig)
         cls = MODELS_CONFIG_MAP.get(architecture, None)
         if cls is not None:
             cls.verify_and_update_config(self)
@@ -4223,6 +4228,9 @@ def try_verify_and_update_config(self):
         if self.model_config.is_hybrid:
             HybridAttentionMambaModelConfig.verify_and_update_config(self)
 
+        if self.model_config.has_mamba2:
+            Mamba2ModelConfig.verify_and_update_config(self)
+
         if self.model_config.convert_type == "classify":
             # Maybe convert ForCausalLM into ForSequenceClassification model.
             from vllm.model_executor.models.adapters import (
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 8a78d811b9a2..b44569b65a0d 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -214,7 +214,7 @@ class CompilationConfig:
     are always used, it can set this to False. Otherwise, it should set this to
     True, and the compiler will copy the input to an internally managed buffer.
     Default is False."""
-    full_cuda_graph: bool = False
+    full_cuda_graph: Optional[bool] = None
    """whether to use a full cuda graph for the entire forward pass rather than
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
@@ -344,7 +344,8 @@ def __post_init__(self) -> None:
     def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
-
+        if self.full_cuda_graph is None:
+            self.full_cuda_graph = False
         from torch._dynamo.backends.registry import list_backends
         torch_backends = list_backends(exclude_tags=tuple())
         if self.level in [
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 4a2ae07581f3..f93f94f01423 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -38,8 +38,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -420,7 +420,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid, SupportsQuant):
+                       IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 601433953040..a23d54e08457 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -275,6 +275,25 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
             "%d for performance.", 1024)
 
 
+class Mamba2ModelConfig(VerifyAndUpdateConfig):
+
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Enable full cuda graphs for decode-only batches to ensure that
+        V1 performance matches that of V0.
+
+        Args:
+            vllm_config: vLLM Config
+        """
+        if not envs.VLLM_USE_V1:
+            return
+
+        compilation_config = vllm_config.compilation_config
+        if compilation_config.full_cuda_graph is None:
+            compilation_config.full_cuda_graph = True
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod
@@ -296,7 +315,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
-        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -362,11 +380,6 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
-        # enable full cuda graphs for decode-only batches
-        # note (tdoublep): this is currently necessary to
-        # match V0 performance
-        compilation_config.full_cuda_graph = True
-
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 85d64af5bd28..2f8ada88c18a 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -36,7 +36,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP)
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -507,7 +508,7 @@ def forward(
 
 
 class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                          IsHybrid):
+                          IsHybrid, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index e59502f12a1c..0740ab8da005 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -38,8 +38,8 @@
 
 from .granitemoe import GraniteMoeMoE
 from .granitemoeshared import GraniteMoeSharedMLP
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -513,7 +513,8 @@ def _load_expert(n, p, name, shard_id, expert_id):
 
 
 class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
-                                  SupportsPP, IsHybrid, SupportsQuant):
+                                  SupportsPP, IsHybrid, SupportsQuant,
+                                  HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b6d9877cd01b..66d03b3364e4 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -468,6 +468,33 @@ def is_attention_free(
     return getattr(model, "is_attention_free", False)
 
 
+@runtime_checkable
+class HasMamba2(Protocol):
+    """The interface required for all models like mamba2, bamba, zamba2,
+    etc., that have mamba2 blocks"""
+
+    has_mamba2: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates if the model has mamba2 blocks.
+    """
+
+
+@overload
+def has_mamba2(model: object) -> TypeIs[HasMamba2]:
+    ...
+
+
+@overload
+def has_mamba2(model: type[object]) -> TypeIs[type[HasMamba2]]:
+    ...
+
+
+def has_mamba2(
+    model: Union[type[object], object]
+) -> Union[TypeIs[type[HasMamba2]], TypeIs[HasMamba2]]:
+    return getattr(model, "has_mamba2", False)
+
+
 @runtime_checkable
 class IsHybrid(Protocol):
     """The interface required for all models like Jamba that have both
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 75e92b01762d..9d36539e841e 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -26,7 +26,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState,
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
                                                    IsAttentionFree)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
@@ -198,7 +198,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
+class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, HasMamba2):
 
     @classmethod
     def get_mamba_state_shape_from_config(
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index eb62d5a53c1a..53108186ed75 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -45,9 +45,9 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                   SupportsLoRA, SupportsPP,
-                                                   SupportsQuant)
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
+                                                   IsHybrid, SupportsLoRA,
+                                                   SupportsPP, SupportsQuant)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                     MambaCacheParams)
 from vllm.model_executor.models.utils import (
@@ -446,7 +446,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                           IsHybrid, SupportsQuant):
+                           IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index aca3d84f0071..3f0df25dd52c 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -25,8 +25,8 @@
 from vllm.transformers_utils.dynamic_module import (
     try_get_class_from_dynamic_module)
 
-from .interfaces import (has_inner_state, has_noops, is_attention_free,
-                         is_hybrid, supports_cross_encoding,
+from .interfaces import (has_inner_state, has_mamba2, has_noops,
+                         is_attention_free, is_hybrid, supports_cross_encoding,
                          supports_multimodal, supports_multimodal_raw_input,
                          supports_pp, supports_transcription, supports_v0_only)
 from .interfaces_base import is_pooling_model, is_text_generation_model
@@ -312,6 +312,7 @@ class _ModelInfo:
     has_inner_state: bool
     is_attention_free: bool
     is_hybrid: bool
+    has_mamba2: bool
     has_noops: bool
     supports_transcription: bool
     supports_transcription_only: bool
@@ -329,6 +330,7 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo":
             supports_pp=supports_pp(model),
             has_inner_state=has_inner_state(model),
             is_attention_free=is_attention_free(model),
+            has_mamba2=has_mamba2(model),
             is_hybrid=is_hybrid(model),
             supports_transcription=supports_transcription(model),
             supports_transcription_only=(supports_transcription(model) and
@@ -760,6 +762,14 @@ def is_hybrid_model(
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
         return model_cls.is_hybrid
 
+    def model_has_mamba2(
+        self,
+        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
+    ) -> bool:
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
+        return model_cls.has_mamba2
+
     def is_noops_model(
         self,
         architectures: Union[str, list[str]],
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 4cb0becf302f..31141ecb7b38 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -44,7 +44,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid
+from .interfaces import HasInnerState, HasMamba2, IsHybrid
 from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 
@@ -832,7 +832,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
+class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, HasMamba2):
     """Zamba2 model with causal language modeling head.
 
     This class wraps the core Zamba2 model and adds:

From 8d164ad7497a8beebf15a7d62f204237d6650704 Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Sun, 10 Aug 2025 10:22:45 -0400
Subject: [PATCH 3/3] minor diff

Signed-off-by: Thomas Parnell
---
 vllm/config/compilation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index b44569b65a0d..5c2f538e1fa3 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -344,6 +344,7 @@ def __post_init__(self) -> None:
     def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
+
         if self.full_cuda_graph is None:
             self.full_cuda_graph = False
         from torch._dynamo.backends.registry import list_backends
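
Taken together, the series turns full_cuda_graph into a tri-state option: an explicit user setting is always respected, an unset value (None) is promoted to True by Mamba2ModelConfig for V1 models that declare HasMamba2, and anything still unset falls back to False in init_backend. The standalone Python sketch below mirrors that resolution order with toy stand-ins rather than the real vLLM classes; the names ToyCompilationConfig, apply_mamba2_default, and init_backend_fallback are illustrative assumptions, not part of the patches.

from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyCompilationConfig:
    # Stand-in for CompilationConfig after PATCH 2: tri-state default.
    full_cuda_graph: Optional[bool] = None


def apply_mamba2_default(cfg: ToyCompilationConfig, has_mamba2: bool,
                         use_v1: bool = True) -> None:
    # Mirrors Mamba2ModelConfig.verify_and_update_config: only V1 models
    # with mamba2 blocks get the True default, and only if the user left
    # the flag unset.
    if not use_v1 or not has_mamba2:
        return
    if cfg.full_cuda_graph is None:
        cfg.full_cuda_graph = True


def init_backend_fallback(cfg: ToyCompilationConfig) -> None:
    # Mirrors the PATCH 2/3 change to init_backend: anything still unset
    # falls back to the old default of False.
    if cfg.full_cuda_graph is None:
        cfg.full_cuda_graph = False


# Unset flag on a mamba2 model: enabled by default.
cfg = ToyCompilationConfig()
apply_mamba2_default(cfg, has_mamba2=True)
init_backend_fallback(cfg)
assert cfg.full_cuda_graph is True

# Explicit user choice is never overridden.
cfg = ToyCompilationConfig(full_cuda_graph=False)
apply_mamba2_default(cfg, has_mamba2=True)
init_backend_fallback(cfg)
assert cfg.full_cuda_graph is False

# Non-mamba2 model keeps the False fallback.
cfg = ToyCompilationConfig()
apply_mamba2_default(cfg, has_mamba2=False)
init_backend_fallback(cfg)
assert cfg.full_cuda_graph is False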