Skip to content

Commit 6bb7905

Browse files
isharif168, dsikka, and shanjiaz
authored
[CPU] Linearize gpt_oss model and add example to quantize it to w4a8 (vllm-project#2113)
SUMMARY: Add code to convert the gpt_oss model and then quantize model to w8a8 TEST PLAN: Tested with gpt_oss model and run using vllm Sample output === Prompt 0 === <|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. Knowledge cutoff: 2024-06 Current date: 2025-12-04 Reasoning: medium Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions You are a helpful assistant. <|end|><|start|>user<|message|>Give 3 reasons to use AI.<|end|><|start|>assistant --- Candidate 0 --- analysisWe need to answer with three reasons to use AI. Should be short. Provide 3 reasons. Let's do it.assistantcommentaryHere are three reasons to use AI: Personalized learning – AI can adapt to your learning style and pace, providing customized lessons that suit your learning style and progress. Skill development – AI can help you develop skills that are in demand, such as coding, data analysis, and data science. Supportive design We should provide a short answer.assistantfinalHere are three reasons to use AI: Personalized learning – AI adapts to your learning style and pace, giving you personalized lessons that fit your needs. Skill development – AI helps you learn and practice skills that are in demand, like coding, data analysis, and data‑science skills. Real‑world impact – AI helps you practice real-world skills in real life, such as learning new skills and applying them in real life. finish_reason: stop num_tokens: 226 --------- Signed-off-by: Sharif Inamdar <sharif.inamdar@arm.com> Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com> Co-authored-by: shanjiaz <43143795+shanjiaz@users.noreply.github.com>
1 parent 6264c59 commit 6bb7905

File tree

2 files changed

+338
-0
lines changed

2 files changed

+338
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import torch
2+
from compressed_tensors.quantization import QuantizationScheme
3+
from compressed_tensors.quantization.quant_args import (
4+
QuantizationArgs,
5+
QuantizationStrategy,
6+
QuantizationType,
7+
)
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
9+
10+
from llmcompressor import oneshot
11+
from llmcompressor.modeling.gpt_oss import convert_model_for_quantization_gptoss
12+
from llmcompressor.modifiers.quantization import QuantizationModifier
13+
14+
15+
def _build_w4a8_recipe() -> QuantizationModifier:
    """Build the W4A8 recipe: int4 channelwise symmetric static weights,
    int8 per-token dynamic asymmetric input activations, lm_head excluded."""
    # Weights: 4-bit, channelwise, symmetric, static
    weights_args = QuantizationArgs(
        num_bits=4,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.CHANNEL,
        symmetric=True,
        dynamic=False,
    )

    # Activations: 8-bit, per-token, asymmetric, dynamic.
    # observer=None because dynamic quantization needs no calibration observer.
    activations_args = QuantizationArgs(
        num_bits=8,
        type=QuantizationType.INT,
        strategy=QuantizationStrategy.TOKEN,
        symmetric=False,
        dynamic=True,
        observer=None,
    )

    # Apply to all Linear layers; lm_head is excluded via `ignore` below.
    scheme = QuantizationScheme(
        targets=["Linear"],
        weights=weights_args,
        input_activations=activations_args,
    )

    return QuantizationModifier(
        config_groups={"group_0": scheme},
        ignore=["lm_head"],
    )


def main(model_id: str = "openai/gpt-oss-20b", output_dir=None):
    """Linearize a GPT-OSS model's fused MoE experts and quantize it to W4A8.

    Args:
        model_id: HF hub id or local path of the GPT-OSS checkpoint.
            Defaults to ``openai/gpt-oss-20b`` (the original hard-coded model).
        output_dir: Where to write the quantized model. Defaults to
            ``<model-name>-w4a8-channelwise`` next to the working directory.
    """
    MODEL_ID = model_id
    BASE_NAME = MODEL_ID.rstrip("/").split("/")[-1]
    OUTPUT_DIR = output_dir if output_dir is not None else f"{BASE_NAME}-w4a8-channelwise"

    print(f"[GPT-OSS] Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # ---- GPT-OSS MoE → linear experts conversion ----
    # Fused gate_up_proj/down_proj expert tensors are replaced with plain
    # nn.Linear modules so the quantizer can target them as "Linear".
    print("[GPT-OSS] Converting fused MoE experts to LinearExperts for quantization...")
    convert_model_for_quantization_gptoss(model)
    print("[GPT-OSS] Conversion completed.")

    # ---- Quantization config: W4A8 (int4 weights, int8 activations) ----
    recipe = _build_w4a8_recipe()

    print(f"[GPT-OSS] Starting oneshot quantization → {OUTPUT_DIR}")
    oneshot(
        model=model,
        recipe=recipe,
        tokenizer=tokenizer,
        output_dir=OUTPUT_DIR,
        trust_remote_code_model=True,
    )
    print(f"[GPT-OSS] Quantization finished. Quantized model written to: {OUTPUT_DIR}")
76+
77+
78+
# Run the end-to-end conversion + quantization when executed as a script.
if __name__ == "__main__":
    main()
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import List, Optional
5+
6+
import torch
7+
import torch.nn as nn
8+
9+
10+
class LinearExpert(nn.Module):
    """
    A single MoE expert with separate gate / up / down projections.

    Mirrors the GPT-OSS expert computation:
        gate = clamp(gate_proj(x))
        up   = clamp(up_proj(x))
        glu  = gate * sigmoid(alpha * gate)
        y    = down_proj((up + 1) * glu)
    """

    def __init__(
        self, hidden_size: int, intermediate_size: int, alpha: float, limit: float
    ):
        super().__init__()
        self.alpha = alpha
        self.limit = limit

        # All three projections carry a bias, matching the GPT-OSS layout.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=True)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=True)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gate is clamped from above only; up is clamped symmetrically.
        gate = self.gate_proj(x).clamp(max=self.limit)
        up = self.up_proj(x).clamp(min=-self.limit, max=self.limit)

        # GLU-style activation with the alpha-scaled sigmoid gate.
        glu = gate * torch.sigmoid(self.alpha * gate)
        return self.down_proj((up + 1) * glu)
42+
43+
44+
class LinearExperts(nn.Module):
    """
    Container of multiple LinearExpert modules, driven by
    router_indices / routing_weights.

    This is the "separate gate/up" layout.
    It is meant to replace the original GPT-OSS `experts` submodule.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        num_experts: int,
        alpha: float = 1.702,
        limit: float = 7.0,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        # Kept under the name `expert_dim` so find_experts() can re-discover it.
        self.expert_dim = intermediate_size
        self.num_experts = num_experts
        self.alpha = alpha
        self.limit = limit

        # One independent LinearExpert per routed expert slot.
        self.experts = nn.ModuleList(
            [
                LinearExpert(hidden_size, intermediate_size, alpha, limit)
                for _ in range(num_experts)
            ]
        )

    @torch.no_grad()
    def copy_from_fused_weights(
        self,
        legacy_gate_up_W: torch.Tensor,  # [E, H, 2D]
        legacy_gate_up_b: torch.Tensor,  # [E, 2D]
        legacy_down_W: torch.Tensor,  # [E, D, H]
        legacy_down_b: torch.Tensor,  # [E, H]
    ) -> None:
        """
        De-interleave fused gate_up weights/bias and copy into separate gate/up experts.

        The fused tensor interleaves gate and up along the last axis
        (even columns = gate, odd columns = up), so we split by stride-2
        slicing and transpose into nn.Linear's [out, in] weight layout.
        """
        E, H, twoD = legacy_gate_up_W.shape
        assert E == self.num_experts
        D = twoD // 2
        assert D == self.expert_dim

        for i in range(E):
            Wi = legacy_gate_up_W[i]  # [H, 2D]
            bi = legacy_gate_up_b[i]  # [2D]

            # Even columns feed the gate projection, odd columns the up projection.
            Wg = Wi[:, 0::2].contiguous()  # [H, D]
            Wu = Wi[:, 1::2].contiguous()  # [H, D]
            bg = bi[0::2].contiguous()  # [D]
            bu = bi[1::2].contiguous()  # [D]

            expert = self.experts[i]
            # Transpose: fused layout is [in, out]; nn.Linear stores [out, in].
            expert.gate_proj.weight.copy_(Wg.t())
            expert.gate_proj.bias.copy_(bg)
            expert.up_proj.weight.copy_(Wu.t())
            expert.up_proj.bias.copy_(bu)

            expert.down_proj.weight.copy_(legacy_down_W[i].t())
            expert.down_proj.bias.copy_(legacy_down_b[i])

    def forward(
        self,
        hidden_states: torch.Tensor,  # [B, T, H]
        router_indices: Optional[
            torch.Tensor
        ] = None,  # [B, T, top_k] or [tokens, top_k]
        routing_weights: Optional[torch.Tensor] = None,  # [B, T, E] or [tokens, E]
    ) -> torch.Tensor:
        """
        Implements the MoE computation using the router outputs.

        This is compatible with the GPT-OSS MoE call pattern:
            experts(hidden_states, router_indices, routing_weights)
        """
        # Both router tensors are required despite the Optional defaults;
        # the defaults only exist to match the original call signature.
        assert (
            routing_weights is not None and router_indices is not None
        ), "router inputs required"

        # Normalize shapes to [tokens, H], [tokens, top_k], [tokens, E]
        if hidden_states.dim() == 3:
            B, T, H = hidden_states.shape
            x = hidden_states.reshape(-1, H)
        else:
            # Already flattened
            # NOTE(review): B is forced to 1 here, so a 2-D input comes back
            # as [1, tokens, H] from the final view — confirm callers expect that.
            B, _ = 1, hidden_states.shape[0]
            H = hidden_states.shape[-1]
            x = hidden_states

        if router_indices.dim() == 3:
            router_indices = router_indices.reshape(-1, router_indices.shape[-1])
        if routing_weights.dim() == 3:
            routing_weights = routing_weights.reshape(-1, routing_weights.shape[-1])

        # NOTE(review): assumes routing_weights' last dim already includes the
        # extra "no expert" column (hence "plus_dummy") — verify against router.
        num_experts_plus_dummy = routing_weights.shape[1]
        out = torch.zeros_like(x)

        # GPT-OSS router uses an extra "no expert" bucket at index E
        with torch.no_grad():
            # expert_mask: [num_classes, top_k, tokens] after the permute;
            # expert_hit lists only classes that received at least one token.
            expert_mask = torch.nn.functional.one_hot(
                router_indices, num_classes=num_experts_plus_dummy
            ).permute(2, 1, 0)
            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

        for idx in expert_hit:
            e = idx[0].item()
            if e == self.num_experts:
                # Skip "no expert" bucket
                continue

            # token_idx: positions routed to expert e (repeated if a token
            # selects the same expert in multiple top-k slots).
            _, token_idx = torch.where(expert_mask[e])
            xi = x[token_idx]

            expert = self.experts[e]
            yi = expert(xi)

            # Scale each expert output by its routing weight and scatter-add
            # back into the shared output buffer.
            w = routing_weights[token_idx, e, None]
            out.index_add_(0, token_idx, (yi * w).to(out.dtype))

        return out.view(B, -1, H)
168+
169+
170+
@dataclass
class ExpertMeta:
    """Metadata describing one fused GPT-OSS expert block to be replaced."""

    # Dotted attribute path to the experts module, e.g. "model.layers.0.mlp.experts".
    path: str
    # Model hidden size (H) read from the legacy experts module.
    hidden_size: int
    # Per-expert intermediate size (D).
    intermediate_size: int
    # Number of experts (E) in the MoE block.
    num_experts: int
    # Device/dtype of the legacy module's parameters, used to place the replacement.
    device: torch.device
    dtype: torch.dtype
178+
179+
180+
def get_module_by_path(root: nn.Module, dotpath: str) -> nn.Module:
    """Resolve a dotted attribute path against *root*; an empty path yields *root*."""
    if not dotpath:
        return root
    node: nn.Module = root
    for attr in dotpath.split("."):
        node = getattr(node, attr)
    return node
187+
188+
189+
def set_module_by_path(root: nn.Module, dotpath: str, new_module: nn.Module) -> None:
    """Replace the module at *dotpath* under *root* with *new_module*."""
    # rpartition yields ("", "", name) for a top-level path, so the parent
    # lookup falls back to root itself in that case.
    parent_path, _, leaf_name = dotpath.rpartition(".")
    parent = get_module_by_path(root, parent_path)
    setattr(parent, leaf_name, new_module)
193+
194+
195+
def find_experts(model: nn.Module) -> List[ExpertMeta]:
    """
    Locate GPT-OSS MoE expert modules under model.model.layers[*].mlp.experts.

    Returns one ExpertMeta per layer, capturing the sizes plus the device
    and dtype of the legacy module's parameters so the replacement can be
    placed identically.
    """
    metas: List[ExpertMeta] = []
    for li, layer in enumerate(model.model.layers):
        experts = layer.mlp.experts

        # Probe a single parameter for placement info instead of walking the
        # parameters() generator twice; fall back to the defaults of a fresh
        # scalar tensor (CPU / default float dtype) for parameter-less modules.
        probe = next(experts.parameters(), torch.zeros(()))

        # Prefer the `expert_dim` attribute (used by LinearExperts); older
        # fused modules expose `intermediate_size` instead.
        intermediate = getattr(experts, "expert_dim", None)
        if intermediate is None:
            intermediate = experts.intermediate_size

        metas.append(
            ExpertMeta(
                path=f"model.layers.{li}.mlp.experts",
                hidden_size=experts.hidden_size,
                intermediate_size=intermediate,
                num_experts=experts.num_experts,
                device=probe.device,
                dtype=probe.dtype,
            )
        )
    return metas
219+
220+
221+
def convert_model_for_quantization_gptoss(model: nn.Module) -> None:
    """
    In-place conversion of a GPT-OSS model:

    - Finds all fused MoE expert blocks (with gate_up_proj/down_proj).
    - Replaces them with LinearExperts that expose plain nn.Linear
      parameters (gate_proj, up_proj, down_proj), which play nicely
      with LLM Compressor W4A8 quantization.
    """
    fused_attrs = (
        "gate_up_proj",
        "gate_up_proj_bias",
        "down_proj",
        "down_proj_bias",
    )

    for meta in find_experts(model):
        legacy = get_module_by_path(model, meta.path)

        # Sanity check: only convert modules carrying the fused layout.
        if any(not hasattr(legacy, attr) for attr in fused_attrs):
            continue

        replacement = LinearExperts(
            hidden_size=meta.hidden_size,
            intermediate_size=meta.intermediate_size,
            num_experts=meta.num_experts,
        ).to(device=meta.device, dtype=meta.dtype)

        replacement.copy_from_fused_weights(
            legacy_gate_up_W=legacy.gate_up_proj,
            legacy_gate_up_b=legacy.gate_up_proj_bias,
            legacy_down_W=legacy.down_proj,
            legacy_down_b=legacy.down_proj_bias,
        )

        set_module_by_path(model, meta.path, replacement)

0 commit comments

Comments
 (0)