From 0d11efc2cd9b54c3092a7d950357fe63fdb421ed Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 08:17:04 -0500
Subject: [PATCH 1/7] Granite4 Add Quant Config

---
 .../model_executor/models/granitemoehybrid.py | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index bac64eec8c55..7fc8bc63b46d 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -652,11 +652,62 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
+    def maybe_update_quant_config(
+        self, quant_config: QuantizationConfig
+    ) -> QuantizationConfig:
+        """
+        Update quant config so that ignored module and target module names
+        match the vLLM model names.
+        Granite model specific: mamba -> mixer remapping.
+        """
+        remapping_rules = [
+            # Granite model: mamba -> mixer remapping
+            (
+                r"model\.layers\.(\d+)\.mamba\.in_proj",
+                r"model.layers.\1.mixer.in_proj",
+            ),
+            (
+                r"model\.layers\.(\d+)\.mamba\.out_proj",
+                r"model.layers.\1.mixer.out_proj",
+            ),
+        ]
+        # Update ignore list
+        if hasattr(quant_config, "ignore"):
+            updated_ignore = []
+            for name in quant_config.ignore:
+                updated_name = name
+                for pattern, repl in remapping_rules:
+                    if re.fullmatch(pattern, name):
+                        updated_name = re.sub(pattern, repl, name)
+                updated_ignore.append(updated_name)
+            quant_config.ignore = updated_ignore
+        # Update target list
+        if hasattr(quant_config, "config_groups"):
+            config_groups = quant_config.config_groups
+            for group_name in config_groups:
+                if "targets" in config_groups[group_name]:
+                    targets = []
+                    for name in config_groups[group_name]["targets"]:
+                        updated_name = name
+                        for pattern, repl in remapping_rules:
+                            if re.fullmatch(pattern, name):
+                                updated_name = re.sub(pattern, repl, name)
+                        targets.append(updated_name)
+                    config_groups[group_name]["targets"] = targets
+            quant_config.config_groups = config_groups
+        return quant_config
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
 
         self.vllm_config = vllm_config
+
+        if hasattr(vllm_config, "quant_config"):
+            vllm_config.quant_config = self.maybe_update_quant_config(
+                vllm_config.quant_config
+            )
+
         self.model_config = vllm_config.model_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config

From 375c61e9c5dd332bdb9a12bf4e651ce98ef9c74e Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 08:31:07 -0500
Subject: [PATCH 2/7] import re

---
 vllm/model_executor/models/granitemoehybrid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 7fc8bc63b46d..a6ee904c5eef 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -49,7 +49,7 @@
     make_layers,
     maybe_prefix,
 )
-
+import re
 
 class GraniteMoeHybridMambaDecoderLayer(nn.Module):
     def __init__(

From 38a677b85eedbf77dd123e3601e79aa206796097 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 09:47:28 -0500
Subject: [PATCH 3/7] minor indentation fix

---
 .../model_executor/models/granitemoehybrid.py | 87 +++++++++----------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index a6ee904c5eef..ebc544fa959b 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -652,50 +652,49 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
-    def maybe_update_quant_config(
-        self, quant_config: QuantizationConfig
-    ) -> QuantizationConfig:
-        """
-        Update quant config so that ignored module and target module names
-        match the vLLM model names.
-        Granite model specific: mamba -> mixer remapping.
-        """
-        remapping_rules = [
-            # Granite model: mamba -> mixer remapping
-            (
-                r"model\.layers\.(\d+)\.mamba\.in_proj",
-                r"model.layers.\1.mixer.in_proj",
-            ),
-            (
-                r"model\.layers\.(\d+)\.mamba\.out_proj",
-                r"model.layers.\1.mixer.out_proj",
-            ),
-        ]
-        # Update ignore list
-        if hasattr(quant_config, "ignore"):
-            updated_ignore = []
-            for name in quant_config.ignore:
-                updated_name = name
-                for pattern, repl in remapping_rules:
-                    if re.fullmatch(pattern, name):
-                        updated_name = re.sub(pattern, repl, name)
-                updated_ignore.append(updated_name)
-            quant_config.ignore = updated_ignore
-        # Update target list
-        if hasattr(quant_config, "config_groups"):
-            config_groups = quant_config.config_groups
-            for group_name in config_groups:
-                if "targets" in config_groups[group_name]:
-                    targets = []
-                    for name in config_groups[group_name]["targets"]:
-                        updated_name = name
-                        for pattern, repl in remapping_rules:
-                            if re.fullmatch(pattern, name):
-                                updated_name = re.sub(pattern, repl, name)
-                        targets.append(updated_name)
-                    config_groups[group_name]["targets"] = targets
-            quant_config.config_groups = config_groups
-        return quant_config
+    def maybe_update_quant_config(self, quant_config: QuantizationConfig) -> QuantizationConfig:
+        """
+        Update quant config so that ignored module and target module names
+        match the vLLM model names.
+        Granite model specific: mamba -> mixer remapping.
+        """
+        remapping_rules = [
+            # Granite model: mamba -> mixer remapping
+            (
+                r"model\.layers\.(\d+)\.mamba\.in_proj",
+                r"model.layers.\1.mixer.in_proj",
+            ),
+            (
+                r"model\.layers\.(\d+)\.mamba\.out_proj",
+                r"model.layers.\1.mixer.out_proj",
+            ),
+        ]
+        # Update ignore list
+        if hasattr(quant_config, "ignore"):
+            updated_ignore = []
+            for name in quant_config.ignore:
+                updated_name = name
+                for pattern, repl in remapping_rules:
+                    if re.fullmatch(pattern, name):
+                        updated_name = re.sub(pattern, repl, name)
+                updated_ignore.append(updated_name)
+            quant_config.ignore = updated_ignore
+        # Update target list
+        if hasattr(quant_config, "config_groups"):
+            config_groups = quant_config.config_groups
+            for group_name in config_groups:
+                if "targets" in config_groups[group_name]:
+                    targets = []
+                    for name in config_groups[group_name]["targets"]:
+                        updated_name = name
+                        for pattern, repl in remapping_rules:
+                            if re.fullmatch(pattern, name):
+                                updated_name = re.sub(pattern, repl, name)
+                        targets.append(updated_name)
+                    config_groups[group_name]["targets"] = targets
+            quant_config.config_groups = config_groups
+        return quant_config
+
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

From c7f5534ecb009dfb78b51d7ffee78fe12b61e398 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Thu, 13 Nov 2025 15:48:25 -0500
Subject: [PATCH 4/7] Fix Granite4 MoE weight loading function

---
 .../model_executor/models/granitemoehybrid.py | 93 ++++++++++---------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index ebc544fa959b..399ff934a4d6 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -81,7 +81,7 @@ def __init__(
             model_config=model_config,
             cache_config=cache_config,
             quant_config=quant_config,
-            prefix=f"{prefix}.mixer",
+            prefix=f"{prefix}.mamba",
         )
 
         self.block_sparse_moe = None
@@ -570,6 +570,53 @@ def _load_quant_expert(name, loaded_weight):
                         shard_id="w2",
                         expert_id=e,
                     )
+
+            elif ('.block_sparse_moe.output_linear.experts.' in n and
+                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
+
+                # Extract expert ID from the parameter name
+                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
+
+                # Generate the target w2 name
+                w2_name = n.replace(
+                    f'.block_sparse_moe.output_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w2.weight")
+
+                w2_param = p
+                _load_expert(n.replace(f'.output_linear.experts.{expert_idx}.', '.experts.w2_'),
+                             w2_param,
+                             w2_name,
+                             shard_id='w2',
+                             expert_id=expert_idx)
+
+            elif ('.block_sparse_moe.input_linear.experts.' in n and
+                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
+
+                # Extract expert ID from the parameter name
+                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
+
+                # Generate the target w1 and w3 names
+                w1_name = n.replace(
+                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w1.weight")
+                w3_name = n.replace(
+                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w3.weight")
+
+                # Split the parameter into w1 and w3
+                w1_param, w3_param = p.chunk(2, dim=0)
+
+                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
+                             w1_param,
+                             w1_name,
+                             shard_id='w1',
+                             expert_id=expert_idx)
+                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
+                             w3_param,
+                             w3_name,
+                             shard_id='w3',
+                             expert_id=expert_idx)
+
             elif n.endswith(".block_sparse_moe.router.layer.weight"):
                 gate_name = n.replace(
                     ".block_sparse_moe.router.layer.weight",
@@ -652,50 +699,6 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
-    def maybe_update_quant_config(self, quant_config: QuantizationConfig) -> QuantizationConfig:
-        """
-        Update quant config so that ignored module and target module names
-        match the vLLM model names.
-        Granite model specific: mamba -> mixer remapping.
-        """
-        remapping_rules = [
-            # Granite model: mamba -> mixer remapping
-            (
-                r"model\.layers\.(\d+)\.mamba\.in_proj",
-                r"model.layers.\1.mixer.in_proj",
-            ),
-            (
-                r"model\.layers\.(\d+)\.mamba\.out_proj",
-                r"model.layers.\1.mixer.out_proj",
-            ),
-        ]
-        # Update ignore list
-        if hasattr(quant_config, "ignore"):
-            updated_ignore = []
-            for name in quant_config.ignore:
-                updated_name = name
-                for pattern, repl in remapping_rules:
-                    if re.fullmatch(pattern, name):
-                        updated_name = re.sub(pattern, repl, name)
-                updated_ignore.append(updated_name)
-            quant_config.ignore = updated_ignore
-        # Update target list
-        if hasattr(quant_config, "config_groups"):
-            config_groups = quant_config.config_groups
-            for group_name in config_groups:
-                if "targets" in config_groups[group_name]:
-                    targets = []
-                    for name in config_groups[group_name]["targets"]:
-                        updated_name = name
-                        for pattern, repl in remapping_rules:
-                            if re.fullmatch(pattern, name):
-                                updated_name = re.sub(pattern, repl, name)
-                        targets.append(updated_name)
-                    config_groups[group_name]["targets"] = targets
-            quant_config.config_groups = config_groups
-        return quant_config
-
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

From 0c5b1ce12653bd02e39e777ff65e7b2cc6146a50 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Thu, 13 Nov 2025 15:50:24 -0500
Subject: [PATCH 5/7] remove import re

---
 vllm/model_executor/models/granitemoehybrid.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 399ff934a4d6..55d64c3cc9bc 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -49,7 +49,6 @@
     make_layers,
     maybe_prefix,
 )
-import re
 
 class GraniteMoeHybridMambaDecoderLayer(nn.Module):
     def __init__(

From e0cb248884a5c2a271c2f9042148065745d4cb14 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 17 Nov 2025 11:46:22 -0500
Subject: [PATCH 6/7] minor fix

---
 vllm/model_executor/models/granitemoehybrid.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 55d64c3cc9bc..6e49a24592df 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -703,12 +703,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
 
         self.vllm_config = vllm_config
-
-        if hasattr(vllm_config, "quant_config"):
-            vllm_config.quant_config = self.maybe_update_quant_config(
-                vllm_config.quant_config
-            )
-
         self.model_config = vllm_config.model_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config

From 4825e5084325b406198e718f6e00f77ca1e63953 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 1 Dec 2025 10:59:19 -0500
Subject: [PATCH 7/7] Update weight loading

---
 .../model_executor/models/granitemoehybrid.py | 46 -------------------
 1 file changed, 46 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 6e49a24592df..d6955111797b 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -569,52 +569,6 @@ def _load_quant_expert(name, loaded_weight):
                         shard_id="w2",
                         expert_id=e,
                     )
-
-            elif ('.block_sparse_moe.output_linear.experts.' in n and
-                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
-
-                # Extract expert ID from the parameter name
-                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
-
-                # Generate the target w2 name
-                w2_name = n.replace(
-                    f'.block_sparse_moe.output_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w2.weight")
-
-                w2_param = p
-                _load_expert(n.replace(f'.output_linear.experts.{expert_idx}.', '.experts.w2_'),
-                             w2_param,
-                             w2_name,
-                             shard_id='w2',
-                             expert_id=expert_idx)
-
-            elif ('.block_sparse_moe.input_linear.experts.' in n and
-                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
-
-                # Extract expert ID from the parameter name
-                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
-
-                # Generate the target w1 and w3 names
-                w1_name = n.replace(
-                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w1.weight")
-                w3_name = n.replace(
-                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w3.weight")
-
-                # Split the parameter into w1 and w3
-                w1_param, w3_param = p.chunk(2, dim=0)
-
-                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
-                             w1_param,
-                             w1_name,
-                             shard_id='w1',
-                             expert_id=expert_idx)
-                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
-                             w3_param,
-                             w3_name,
-                             shard_id='w3',
-                             expert_id=expert_idx)
 
             elif n.endswith(".block_sparse_moe.router.layer.weight"):
                 gate_name = n.replace(