[Model] Granite-4 support loading quantized checkpoint (vllm-project#22925)

cyang49 · dumb0002 · commit fe07405f8ba7 · 2025-08-28T17:26:26.000-04:00
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
@@ -471,7 +471,10 @@ def _load_expert(n, p, name, shard_id, expert_id):
             # Mapping different experts' layout:
             #  from HF (input_linear, output_linear, router)
             #  to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate)
-            if n.endswith('.block_sparse_moe.input_linear.weight'):
+            # The renaming and parameter loading logic is the same for weight
+            # and weight_scale tensors so we can reuse them without issues.
+            if (n.endswith('.block_sparse_moe.input_linear.weight') or
+                    n.endswith('.block_sparse_moe.input_linear.weight_scale')):
                 for e in range(p.size(0)):
                     w1_name = n.replace(
                         '.block_sparse_moe.input_linear.weight',
@@ -490,7 +493,8 @@ def _load_expert(n, p, name, shard_id, expert_id):
                                  w3_name,
                                  shard_id='w3',
                                  expert_id=e)
-            elif n.endswith('.block_sparse_moe.output_linear.weight'):
+            elif (n.endswith('.block_sparse_moe.output_linear.weight') or
+                  n.endswith('.block_sparse_moe.output_linear.weight_scale')):
                 for e in range(p.size(0)):
                     w2_name = n.replace(
                         '.block_sparse_moe.output_linear.weight',