diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 49da3fd848ec..a7240491ac66 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1678,6 +1678,10 @@ def is_attention_free(self) -> bool:
     def is_hybrid(self) -> bool:
         return self._model_info.is_hybrid
 
+    @property
+    def has_mamba2(self) -> bool:
+        return self._model_info.has_mamba2
+
     @property
     def has_noops(self) -> bool:
         return self._model_info.has_noops
@@ -4215,7 +4219,8 @@ def try_verify_and_update_config(self):
             return
 
         from vllm.model_executor.models.config import (
-            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
+            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig,
+            Mamba2ModelConfig)
         cls = MODELS_CONFIG_MAP.get(architecture, None)
         if cls is not None:
             cls.verify_and_update_config(self)
@@ -4223,6 +4228,9 @@ def try_verify_and_update_config(self):
         if self.model_config.is_hybrid:
             HybridAttentionMambaModelConfig.verify_and_update_config(self)
 
+        if self.model_config.has_mamba2:
+            Mamba2ModelConfig.verify_and_update_config(self)
+
         if self.model_config.convert_type == "classify":
             # Maybe convert ForCausalLM into ForSequenceClassification model.
             from vllm.model_executor.models.adapters import (
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 8a78d811b9a2..5c2f538e1fa3 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -214,7 +214,7 @@ class CompilationConfig:
     are always used, it can set this to False. Otherwise, it should set this to
     True, and the compiler will copy the input to an internally managed buffer.
     Default is False."""
-    full_cuda_graph: bool = False
+    full_cuda_graph: Optional[bool] = None
     """whether to use a full cuda graph for the entire forward pass rather than
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
@@ -345,6 +345,8 @@ def init_backend(self, vllm_config: VllmConfig) -> Union[str, Callable]:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
 
+        if self.full_cuda_graph is None:
+            self.full_cuda_graph = False
         from torch._dynamo.backends.registry import list_backends
         torch_backends = list_backends(exclude_tags=tuple())
         if self.level in [
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 4a2ae07581f3..f93f94f01423 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -38,8 +38,8 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils import LayerBlockType
 
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -420,7 +420,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                       IsHybrid, SupportsQuant):
+                       IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 6f21cd267b0e..a23d54e08457 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -275,6 +275,25 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
                 "%d for performance.", 1024)
 
 
+class Mamba2ModelConfig(VerifyAndUpdateConfig):
+
+    @classmethod
+    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+        """
+        Enable full cuda graphs for decode-only batches to ensure that
+        V1 performance matches that of V0.
+
+        Args:
+            vllm_config: vLLM Config
+        """
+        if not envs.VLLM_USE_V1:
+            return
+
+        compilation_config = vllm_config.compilation_config
+        if compilation_config.full_cuda_graph is None:
+            compilation_config.full_cuda_graph = True
+
+
 class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
 
     @classmethod
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 85d64af5bd28..2f8ada88c18a 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -36,7 +36,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP)
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -507,7 +508,7 @@ def forward(
 
 
 class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                          IsHybrid):
+                          IsHybrid, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index e59502f12a1c..0740ab8da005 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -38,8 +38,8 @@
 
 from .granitemoe import GraniteMoeMoE
 from .granitemoeshared import GraniteMoeSharedMLP
-from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
-                         SupportsQuant)
+from .interfaces import (HasInnerState, HasMamba2, IsHybrid, SupportsLoRA,
+                         SupportsPP, SupportsQuant)
 from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
@@ -513,7 +513,8 @@ def _load_expert(n, p, name, shard_id, expert_id):
 
 
 class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
-                                  SupportsPP, IsHybrid, SupportsQuant):
+                                  SupportsPP, IsHybrid, SupportsQuant,
+                                  HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b6d9877cd01b..66d03b3364e4 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -468,6 +468,33 @@ def is_attention_free(
     return getattr(model, "is_attention_free", False)
 
 
+@runtime_checkable
+class HasMamba2(Protocol):
+    """The interface required for all models like mamba2, bamba, zamba2,
+    etc., that have mamba2 blocks"""
+
+    has_mamba2: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates if the model has mamba2 blocks.
+    """
+
+
+@overload
+def has_mamba2(model: object) -> TypeIs[HasMamba2]:
+    ...
+
+
+@overload
+def has_mamba2(model: type[object]) -> TypeIs[type[HasMamba2]]:
+    ...
+
+
+def has_mamba2(
+    model: Union[type[object], object]
+) -> Union[TypeIs[type[HasMamba2]], TypeIs[HasMamba2]]:
+    return getattr(model, "has_mamba2", False)
+
+
 @runtime_checkable
 class IsHybrid(Protocol):
     """The interface required for all models like Jamba that have both
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 75e92b01762d..9d36539e841e 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -26,7 +26,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState,
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
                                                     IsAttentionFree)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                      MambaCacheParams)
@@ -198,7 +198,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
+class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, HasMamba2):
 
     @classmethod
     def get_mamba_state_shape_from_config(
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index eb62d5a53c1a..53108186ed75 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -45,9 +45,9 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid,
-                                                    SupportsLoRA, SupportsPP,
-                                                    SupportsQuant)
+from vllm.model_executor.models.interfaces import (HasInnerState, HasMamba2,
+                                                    IsHybrid, SupportsLoRA,
+                                                    SupportsPP, SupportsQuant)
 from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
                                                      MambaCacheParams)
 from vllm.model_executor.models.utils import (
@@ -446,7 +446,7 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 
 class NemotronHForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
-                           IsHybrid, SupportsQuant):
+                           IsHybrid, SupportsQuant, HasMamba2):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index aca3d84f0071..3f0df25dd52c 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -25,8 +25,8 @@
 from vllm.transformers_utils.dynamic_module import (
     try_get_class_from_dynamic_module)
 
-from .interfaces import (has_inner_state, has_noops, is_attention_free,
-                         is_hybrid, supports_cross_encoding,
+from .interfaces import (has_inner_state, has_mamba2, has_noops,
+                         is_attention_free, is_hybrid, supports_cross_encoding,
                          supports_multimodal, supports_multimodal_raw_input,
                          supports_pp, supports_transcription, supports_v0_only)
 from .interfaces_base import is_pooling_model, is_text_generation_model
@@ -312,6 +312,7 @@ class _ModelInfo:
     has_inner_state: bool
     is_attention_free: bool
     is_hybrid: bool
+    has_mamba2: bool
     has_noops: bool
     supports_transcription: bool
     supports_transcription_only: bool
@@ -329,6 +330,7 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo":
             supports_pp=supports_pp(model),
             has_inner_state=has_inner_state(model),
             is_attention_free=is_attention_free(model),
+            has_mamba2=has_mamba2(model),
             is_hybrid=is_hybrid(model),
             supports_transcription=supports_transcription(model),
             supports_transcription_only=(supports_transcription(model) and
@@ -760,6 +762,14 @@ def is_hybrid_model(
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
         return model_cls.is_hybrid
 
+    def model_has_mamba2(
+        self,
+        architectures: Union[str, list[str]],
+        model_config: ModelConfig,
+    ) -> bool:
+        model_cls, _ = self.inspect_model_cls(architectures, model_config)
+        return model_cls.has_mamba2
+
     def is_noops_model(
         self,
         architectures: Union[str, list[str]],
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index 4cb0becf302f..31141ecb7b38 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -44,7 +44,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import HasInnerState, IsHybrid
+from .interfaces import HasInnerState, HasMamba2, IsHybrid
 from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 
 
@@ -832,7 +832,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
 
-class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
+class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, HasMamba2):
     """Zamba2 model with causal language modeling head.
 
     This class wraps the core Zamba2 model and adds:
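For readers following the plumbing above, the sketch below is a minimal, self-contained illustration (not vLLM code) of how the pieces are intended to interact: a `HasMamba2`-style marker is detected by a duck-typed `has_mamba2()` lookup, and the tri-state `full_cuda_graph` default is only upgraded when the user has not set it. `FakeMamba2Model`, `FakeCompilationConfig`, `apply_mamba2_defaults`, and `resolve_defaults` are hypothetical stand-ins for `Mamba2ForCausalLM`, `CompilationConfig`, `Mamba2ModelConfig.verify_and_update_config`, and the fallback added to `init_backend`.

```python
# Minimal sketch, not vLLM code: stand-in classes showing how the HasMamba2
# marker and the Optional[bool] full_cuda_graph default are meant to combine.
from typing import ClassVar, Literal, Optional, Protocol, runtime_checkable


@runtime_checkable
class HasMamba2(Protocol):
    """Marker protocol, mirroring the one added to interfaces.py."""

    has_mamba2: ClassVar[Literal[True]] = True


def has_mamba2(model: object) -> bool:
    # Same duck-typed lookup as in the diff; works on classes and instances.
    return getattr(model, "has_mamba2", False)


class FakeMamba2Model(HasMamba2):  # hypothetical stand-in for Mamba2ForCausalLM
    pass


class FakeCompilationConfig:  # hypothetical stand-in for CompilationConfig
    def __init__(self) -> None:
        # Tri-state default: None means "the user did not set it".
        self.full_cuda_graph: Optional[bool] = None


def apply_mamba2_defaults(model_cls: type,
                          config: FakeCompilationConfig) -> None:
    # Mirrors Mamba2ModelConfig.verify_and_update_config: enable full CUDA
    # graphs only when the flag is still unset.
    if has_mamba2(model_cls) and config.full_cuda_graph is None:
        config.full_cuda_graph = True


def resolve_defaults(config: FakeCompilationConfig) -> None:
    # Mirrors the fallback added to CompilationConfig.init_backend.
    if config.full_cuda_graph is None:
        config.full_cuda_graph = False


cfg = FakeCompilationConfig()
apply_mamba2_defaults(FakeMamba2Model, cfg)
resolve_defaults(cfg)
assert cfg.full_cuda_graph is True  # mamba2 model: enabled by default

cfg = FakeCompilationConfig()
cfg.full_cuda_graph = False  # explicit user choice
apply_mamba2_defaults(FakeMamba2Model, cfg)
resolve_defaults(cfg)
assert cfg.full_cuda_graph is False  # user setting is respected
```

The `Optional[bool]` default is what keeps the change non-intrusive: an explicit user setting is never overridden, mamba2-style models get `True` by default, and every other model falls back to the previous `False` behaviour when `init_backend` runs.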