From 0d11efc2cd9b54c3092a7d950357fe63fdb421ed Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 08:17:04 -0500
Subject: [PATCH 1/7] Granite4 Add Quant Config

---
 .../model_executor/models/granitemoehybrid.py | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index bac64eec8c55..7fc8bc63b46d 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -652,11 +652,62 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
+    def maybe_update_quant_config(
+        self, quant_config: QuantizationConfig
+    ) -> QuantizationConfig:
+        """
+        Update quant config so that ignored module and target module names
+        match the vLLM model names.
+        Granite model specific: mamba -> mixer remapping.
+        """
+        remapping_rules = [
+            # Granite model: mamba -> mixer remapping
+            (
+                r"model\.layers\.(\d+)\.mamba\.in_proj",
+                r"model.layers.\1.mixer.in_proj",
+            ),
+            (
+                r"model\.layers\.(\d+)\.mamba\.out_proj",
+                r"model.layers.\1.mixer.out_proj",
+            ),
+        ]
+        # Update ignore list
+        if hasattr(quant_config, "ignore"):
+            updated_ignore = []
+            for name in quant_config.ignore:
+                updated_name = name
+                for pattern, repl in remapping_rules:
+                    if re.fullmatch(pattern, name):
+                        updated_name = re.sub(pattern, repl, name)
+                updated_ignore.append(updated_name)
+            quant_config.ignore = updated_ignore
+        # Update target list
+        if hasattr(quant_config, "config_groups"):
+            config_groups = quant_config.config_groups
+            for group_name in config_groups:
+                if "targets" in config_groups[group_name]:
+                    targets = []
+                    for name in config_groups[group_name]["targets"]:
+                        updated_name = name
+                        for pattern, repl in remapping_rules:
+                            if re.fullmatch(pattern, name):
+                                updated_name = re.sub(pattern, repl, name)
+                        targets.append(updated_name)
+                    config_groups[group_name]["targets"] = targets
+            quant_config.config_groups = config_groups
+        return quant_config
+
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
 
         self.vllm_config = vllm_config
+
+        if hasattr(vllm_config, "quant_config"):
+            vllm_config.quant_config = self.maybe_update_quant_config(
+                vllm_config.quant_config
+            )
+
         self.model_config = vllm_config.model_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config

From 375c61e9c5dd332bdb9a12bf4e651ce98ef9c74e Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 08:31:07 -0500
Subject: [PATCH 2/7] import re

---
 vllm/model_executor/models/granitemoehybrid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 7fc8bc63b46d..a6ee904c5eef 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -49,7 +49,7 @@
     make_layers,
     maybe_prefix,
 )
-
+import re
 
 class GraniteMoeHybridMambaDecoderLayer(nn.Module):
     def __init__(

From 38a677b85eedbf77dd123e3601e79aa206796097 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 10 Nov 2025 09:47:28 -0500
Subject: [PATCH 3/7] minor indentation fix

---
 .../model_executor/models/granitemoehybrid.py | 87 +++++++++----------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index a6ee904c5eef..ebc544fa959b 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -652,50 +652,49 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
-    def maybe_update_quant_config(
-        self, quant_config: QuantizationConfig
-    ) -> QuantizationConfig:
-        """
-        Update quant config so that ignored module and target module names
-        match the vLLM model names.
-        Granite model specific: mamba -> mixer remapping.
-        """
-        remapping_rules = [
-            # Granite model: mamba -> mixer remapping
-            (
-                r"model\.layers\.(\d+)\.mamba\.in_proj",
-                r"model.layers.\1.mixer.in_proj",
-            ),
-            (
-                r"model\.layers\.(\d+)\.mamba\.out_proj",
-                r"model.layers.\1.mixer.out_proj",
-            ),
-        ]
-        # Update ignore list
-        if hasattr(quant_config, "ignore"):
-            updated_ignore = []
-            for name in quant_config.ignore:
-                updated_name = name
-                for pattern, repl in remapping_rules:
-                    if re.fullmatch(pattern, name):
-                        updated_name = re.sub(pattern, repl, name)
-                updated_ignore.append(updated_name)
-            quant_config.ignore = updated_ignore
-        # Update target list
-        if hasattr(quant_config, "config_groups"):
-            config_groups = quant_config.config_groups
-            for group_name in config_groups:
-                if "targets" in config_groups[group_name]:
-                    targets = []
-                    for name in config_groups[group_name]["targets"]:
-                        updated_name = name
-                        for pattern, repl in remapping_rules:
-                            if re.fullmatch(pattern, name):
-                                updated_name = re.sub(pattern, repl, name)
-                        targets.append(updated_name)
-                    config_groups[group_name]["targets"] = targets
-            quant_config.config_groups = config_groups
-        return quant_config
+    def maybe_update_quant_config(self, quant_config: QuantizationConfig) -> QuantizationConfig:
+        """
+        Update quant config so that ignored module and target module names
+        match the vLLM model names.
+        Granite model specific: mamba -> mixer remapping.
+        """
+        remapping_rules = [
+            # Granite model: mamba -> mixer remapping
+            (
+                r"model\.layers\.(\d+)\.mamba\.in_proj",
+                r"model.layers.\1.mixer.in_proj",
+            ),
+            (
+                r"model\.layers\.(\d+)\.mamba\.out_proj",
+                r"model.layers.\1.mixer.out_proj",
+            ),
+        ]
+        # Update ignore list
+        if hasattr(quant_config, "ignore"):
+            updated_ignore = []
+            for name in quant_config.ignore:
+                updated_name = name
+                for pattern, repl in remapping_rules:
+                    if re.fullmatch(pattern, name):
+                        updated_name = re.sub(pattern, repl, name)
+                updated_ignore.append(updated_name)
+            quant_config.ignore = updated_ignore
+        # Update target list
+        if hasattr(quant_config, "config_groups"):
+            config_groups = quant_config.config_groups
+            for group_name in config_groups:
+                if "targets" in config_groups[group_name]:
+                    targets = []
+                    for name in config_groups[group_name]["targets"]:
+                        updated_name = name
+                        for pattern, repl in remapping_rules:
+                            if re.fullmatch(pattern, name):
+                                updated_name = re.sub(pattern, repl, name)
+                        targets.append(updated_name)
+                    config_groups[group_name]["targets"] = targets
+            quant_config.config_groups = config_groups
+        return quant_config
+
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

From c7f5534ecb009dfb78b51d7ffee78fe12b61e398 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Thu, 13 Nov 2025 15:48:25 -0500
Subject: [PATCH 4/7] Fix Granite4 MoE weight loading function

---
 .../model_executor/models/granitemoehybrid.py | 93 ++++++++++---------
 1 file changed, 48 insertions(+), 45 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index ebc544fa959b..399ff934a4d6 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -81,7 +81,7 @@ def __init__(
             model_config=model_config,
             cache_config=cache_config,
             quant_config=quant_config,
-            prefix=f"{prefix}.mixer",
+            prefix=f"{prefix}.mamba",
         )
 
         self.block_sparse_moe = None
@@ -570,6 +570,53 @@ def _load_quant_expert(name, loaded_weight):
                         shard_id="w2",
                         expert_id=e,
                     )
+
+            elif ('.block_sparse_moe.output_linear.experts.' in n and
+                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
+
+                # Extract expert ID from the parameter name
+                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
+
+                # Generate the target w2 name
+                w2_name = n.replace(
+                    f'.block_sparse_moe.output_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w2.weight")
+
+                w2_param = p
+                _load_expert(n.replace(f'.output_linear.experts.{expert_idx}.', '.experts.w2_'),
+                             w2_param,
+                             w2_name,
+                             shard_id='w2',
+                             expert_id=expert_idx)
+
+            elif ('.block_sparse_moe.input_linear.experts.' in n and
+                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
+
+                # Extract expert ID from the parameter name
+                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
+
+                # Generate the target w1 and w3 names
+                w1_name = n.replace(
+                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w1.weight")
+                w3_name = n.replace(
+                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
+                    f".block_sparse_moe.experts.{expert_idx}.w3.weight")
+
+                # Split the parameter into w1 and w3
+                w1_param, w3_param = p.chunk(2, dim=0)
+
+                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
+                             w1_param,
+                             w1_name,
+                             shard_id='w1',
+                             expert_id=expert_idx)
+                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
+                             w3_param,
+                             w3_name,
+                             shard_id='w3',
+                             expert_id=expert_idx)
+
             elif n.endswith(".block_sparse_moe.router.layer.weight"):
                 gate_name = n.replace(
                     ".block_sparse_moe.router.layer.weight",
@@ -652,50 +699,6 @@ def get_mamba_state_shape_from_config(
             conv_kernel=hf_config.mamba_d_conv,
         )
 
-    def maybe_update_quant_config(self, quant_config: QuantizationConfig) -> QuantizationConfig:
-        """
-        Update quant config so that ignored module and target module names
-        match the vLLM model names.
-        Granite model specific: mamba -> mixer remapping.
-        """
-        remapping_rules = [
-            # Granite model: mamba -> mixer remapping
-            (
-                r"model\.layers\.(\d+)\.mamba\.in_proj",
-                r"model.layers.\1.mixer.in_proj",
-            ),
-            (
-                r"model\.layers\.(\d+)\.mamba\.out_proj",
-                r"model.layers.\1.mixer.out_proj",
-            ),
-        ]
-        # Update ignore list
-        if hasattr(quant_config, "ignore"):
-            updated_ignore = []
-            for name in quant_config.ignore:
-                updated_name = name
-                for pattern, repl in remapping_rules:
-                    if re.fullmatch(pattern, name):
-                        updated_name = re.sub(pattern, repl, name)
-                updated_ignore.append(updated_name)
-            quant_config.ignore = updated_ignore
-        # Update target list
-        if hasattr(quant_config, "config_groups"):
-            config_groups = quant_config.config_groups
-            for group_name in config_groups:
-                if "targets" in config_groups[group_name]:
-                    targets = []
-                    for name in config_groups[group_name]["targets"]:
-                        updated_name = name
-                        for pattern, repl in remapping_rules:
-                            if re.fullmatch(pattern, name):
-                                updated_name = re.sub(pattern, repl, name)
-                        targets.append(updated_name)
-                    config_groups[group_name]["targets"] = targets
-            quant_config.config_groups = config_groups
-        return quant_config
-
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config

From 0c5b1ce12653bd02e39e777ff65e7b2cc6146a50 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Thu, 13 Nov 2025 15:50:24 -0500
Subject: [PATCH 5/7] remove import re

---
 vllm/model_executor/models/granitemoehybrid.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 399ff934a4d6..55d64c3cc9bc 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -49,7 +49,6 @@
     make_layers,
     maybe_prefix,
 )
-import re
 
 class GraniteMoeHybridMambaDecoderLayer(nn.Module):
     def __init__(

From e0cb248884a5c2a271c2f9042148065745d4cb14 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 17 Nov 2025 11:46:22 -0500
Subject: [PATCH 6/7] minor fix

---
 vllm/model_executor/models/granitemoehybrid.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 55d64c3cc9bc..6e49a24592df 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -703,12 +703,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
 
         self.vllm_config = vllm_config
-
-        if hasattr(vllm_config, "quant_config"):
-            vllm_config.quant_config = self.maybe_update_quant_config(
-                vllm_config.quant_config
-            )
-
         self.model_config = vllm_config.model_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config

From 4825e5084325b406198e718f6e00f77ca1e63953 Mon Sep 17 00:00:00 2001
From: krishnateja95
Date: Mon, 1 Dec 2025 10:59:19 -0500
Subject: [PATCH 7/7] Update weight loading

---
 .../model_executor/models/granitemoehybrid.py | 46 -------------------
 1 file changed, 46 deletions(-)

diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 6e49a24592df..d6955111797b 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -569,52 +569,6 @@ def _load_quant_expert(name, loaded_weight):
                         shard_id="w2",
                         expert_id=e,
                     )
-
-            elif ('.block_sparse_moe.output_linear.experts.' in n and
-                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
-
-                # Extract expert ID from the parameter name
-                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
-
-                # Generate the target w2 name
-                w2_name = n.replace(
-                    f'.block_sparse_moe.output_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w2.weight")
-
-                w2_param = p
-                _load_expert(n.replace(f'.output_linear.experts.{expert_idx}.', '.experts.w2_'),
-                             w2_param,
-                             w2_name,
-                             shard_id='w2',
-                             expert_id=expert_idx)
-
-            elif ('.block_sparse_moe.input_linear.experts.' in n and
-                  (n.endswith('.weight') or n.endswith('.weight_scale'))):
-
-                # Extract expert ID from the parameter name
-                expert_idx = int(n.split('.experts.')[1].split('.')[0]) if '.experts.' in n else None
-
-                # Generate the target w1 and w3 names
-                w1_name = n.replace(
-                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w1.weight")
-                w3_name = n.replace(
-                    f'.block_sparse_moe.input_linear.experts.{expert_idx}.weight',
-                    f".block_sparse_moe.experts.{expert_idx}.w3.weight")
-
-                # Split the parameter into w1 and w3
-                w1_param, w3_param = p.chunk(2, dim=0)
-
-                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
-                             w1_param,
-                             w1_name,
-                             shard_id='w1',
-                             expert_id=expert_idx)
-                _load_expert(n.replace(f'.input_linear.experts.{expert_idx}.', '.experts.w13_'),
-                             w3_param,
-                             w3_name,
-                             shard_id='w3',
-                             expert_id=expert_idx)
 
             elif n.endswith(".block_sparse_moe.router.layer.weight"):
                 gate_name = n.replace(