diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index bac64eec8c55..d6955111797b 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -50,7 +50,6 @@ maybe_prefix, ) - class GraniteMoeHybridMambaDecoderLayer(nn.Module): def __init__( self, @@ -81,7 +80,7 @@ def __init__( model_config=model_config, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.mixer", + prefix=f"{prefix}.mamba", ) self.block_sparse_moe = None @@ -570,6 +569,7 @@ def _load_quant_expert(name, loaded_weight): shard_id="w2", expert_id=e, ) + elif n.endswith(".block_sparse_moe.router.layer.weight"): gate_name = n.replace( ".block_sparse_moe.router.layer.weight",