Commit 5dd06ff

dsikka authored and cajeonrh committed
[Qwen3VLMoe] Add linearized definition and FP8 Quantization Example (#1874)
SUMMARY:
- Updates the MoE layer to use a linearized definition so that we can quantize and run the model in vLLM.
- Wraps the gate layer so that it is properly ignored. This is a hack for now; we will need to do this properly in ct.
- Not adding a forward pass for now; will add a forward pass as a follow-up, but would like this in for the release to enable FP8 quantization.
- Note: requires the latest transformers.

TEST PLAN:
- Produces `/proving-grounds/engine/hub_cache/Qwen3-VL-235B-A22B-Instruct-FP8_DYNAMIC`, which generates coherent generations:

```python
if __name__ == "__main__":
    import torch
    from vllm import LLM, SamplingParams

    prompts = [
        "The Swiss Alps are",
        "Brad Marchand is",
        "The Toronto Maple Leafs are",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0.80, top_p=0.95, max_tokens=40, min_tokens=10
    )
    llm = LLM(
        "/proving-grounds/engine/hub_cache/Qwen3-VL-235B-A22B-Instruct-FP8_DYNAMIC",
        tensor_parallel_size=2,
        max_model_len=4096,
        enforce_eager=True,
    )
    output = llm.generate(prompts, sampling_params)

    for out in output:
        print(out.outputs[0].text)
```

Generations:

```bash
a true paradise for nature lovers and outdoor enthusiasts. With their snow-capped peaks, lush green valleys, and crystal-clear lakes, the Alps offer a stunning backdrop for a wide range of activities. Whether

a prominent figure in the NHL, known for his exceptional performance and leadership. He has won the Art Ross Trophy as the NHL's leading scorer, with 110 points (32 goals and

a professional ice hockey team based in Toronto, Ontario, Canada. They are members of the Atlantic Division in the Eastern Conference of the National Hockey League (NHL). The team was established in 1
```

Signed-off-by: Cassie Jeon <[email protected]>
1 parent 34a4602 commit 5dd06ff

File tree

4 files changed: +105 -17 lines changed
Lines changed: 40 additions & 0 deletions (new file)

```python
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.quantization import QuantizationModifier

# NOTE: Qwen3-VL-MoE support is not in transformers<=4.56.2;
# you may need to install transformers from source.


MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct"

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = replace_modules_for_calibration(model)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp8 with channel-wise quantization
#   * quantize the activations to fp8 with dynamic per-token quantization
# NOTE: only data-free quantization is supported for Qwen3-VL-MoE currently.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
    ],
)

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-DYNAMIC"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
```
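For orientation before the diffs below: `replace_modules_for_calibration` is driven by the class-name-keyed `replacements` table added in `prepare.py`. A minimal sketch, assuming only what the diff shows (a dict mapping module class names to replacer callables); the walk-and-swap loop and function name here are illustrative, not the library's implementation:

```python
import torch


def swap_matching_modules(model: torch.nn.Module, replacements: dict) -> torch.nn.Module:
    """Illustrative stand-in for replace_modules_for_calibration."""
    for name, module in list(model.named_modules()):
        replace_fn = replacements.get(module.__class__.__name__)
        if replace_fn is None or name == "":
            continue
        # Resolve the parent module and rebind the attribute to the
        # linearized replacement built by the registered callable.
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        setattr(parent, child_name, replace_fn(model.config, module))
    return model
```

Replacing modules this way keeps the checkpoint's weights but swaps in definitions built from plain `nn.Linear` layers, which is what lets the `QuantizationModifier` above reach them with `targets="Linear"`.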

src/llmcompressor/modeling/prepare.py

Lines changed: 4 additions & 8 deletions
```diff
@@ -5,11 +5,8 @@
 from llmcompressor.modeling.deepseek_v3 import replace as replace_deepseekv3
 from llmcompressor.modeling.llama4 import replace as replace_llama4
 from llmcompressor.modeling.qwen3_moe import replace as replace_Qwen3MoE
-
-try:
-    from llmcompressor.modeling.qwen3_next_moe import replace as replace_Qwen3NextMoE
-except ImportError:
-    replace_Qwen3NextMoE = None
+from llmcompressor.modeling.qwen3_next_moe import replace as replace_Qwen3NextMoE
+from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE
 from llmcompressor.utils.helpers import patch_attr

 __all__ = ["replace_modules_for_calibration"]
@@ -18,6 +15,7 @@
 replacements = {
     "DeepseekV3MoE": replace_deepseekv3,
     "Llama4TextMoe": replace_llama4,
+    "Qwen3VLMoeTextSparseMoeBlock": replace_Qwen3VLMoE,
 }


@@ -81,11 +79,9 @@ def update_qwen3_next_moe(model, module, stack, calibrate_all_experts):

 moe_context = {
     "Qwen3MoeForCausalLM": update_qwen3_moe,
+    "Qwen3NextForCausalLM": update_qwen3_next_moe,
 }

-if replace_Qwen3NextMoE is not None:
-    moe_context["Qwen3NextForCausalLM"] = update_qwen3_next_moe
-

 def moe_calibration_context(
     model: PreTrainedModel,
```
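The `moe_context` table maps top-level architecture names to update functions whose signature appears in the hunk header above (`update_qwen3_next_moe(model, module, stack, calibrate_all_experts)`). A hedged sketch of how such a table could be consumed; the dispatch loop is illustrative, and `moe_context`, `update_qwen3_moe`, and `update_qwen3_next_moe` refer to the names defined in `prepare.py`:

```python
def apply_moe_context(model, stack, calibrate_all_experts: bool) -> None:
    # Dispatch on the top-level model class name, matching the keys of
    # `moe_context` ("Qwen3MoeForCausalLM", "Qwen3NextForCausalLM").
    update_fn = moe_context.get(model.__class__.__name__)
    if update_fn is None:
        return  # architectures without MoE calibration handling are untouched
    for module in model.modules():
        update_fn(model, module, stack, calibrate_all_experts)
```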

src/llmcompressor/modeling/qwen3_next_moe.py

Lines changed: 5 additions & 9 deletions
```diff
@@ -15,17 +15,13 @@
 # limitations under the License.

 import torch
-from transformers.models import Qwen3NextConfig
-from transformers.models.qwen3_next.modeling_qwen3_next import (
-    Qwen3NextSparseMoeBlock as OriginalQwen3NextMoeSparseMoeBlock,
-)


 class Qwen3NextSparseMoeBlock(torch.nn.Module):
     def __init__(
         self,
-        config: Qwen3NextConfig,
-        original: OriginalQwen3NextMoeSparseMoeBlock,
+        config,
+        original,
         calibrate_all_experts: bool,
     ):
         super().__init__()
@@ -109,9 +105,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:


 def replace(
-    config: Qwen3NextConfig,
-    module: OriginalQwen3NextMoeSparseMoeBlock,
-    calibrate_all_experts: bool,
+    config,
+    module,
+    calibrate_all_experts,
 ):
     return Qwen3NextSparseMoeBlock(
         config=config, original=module, calibrate_all_experts=calibrate_all_experts
```
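Removing the module-level `Qwen3NextConfig` / `Qwen3NextSparseMoeBlock` imports (and the annotations that required them) is what makes the unconditional import in `prepare.py` safe on older transformers. Where a version-dependent symbol is still needed, the same effect comes from a call-time import, as `qwen3_vl_moe.py` below does for `Qwen3VLMoeTextMLP`; a hedged sketch of that pattern (`build_replacement` is a hypothetical name, not code from this commit):

```python
def build_replacement(config, original, calibrate_all_experts: bool):
    # Resolved only when a Qwen3-Next model is actually being calibrated,
    # so merely importing this file never requires a newer transformers.
    from transformers.models.qwen3_next.modeling_qwen3_next import (
        Qwen3NextSparseMoeBlock as OriginalBlock,
    )

    assert isinstance(original, OriginalBlock)
    return Qwen3NextSparseMoeBlock(
        config=config, original=original, calibrate_all_experts=calibrate_all_experts
    )
```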
src/llmcompressor/modeling/qwen3_vl_moe.py

Lines changed: 56 additions & 0 deletions (new file)

```python
import torch

from llmcompressor.utils.dev import skip_weights_initialize


class LinearQwen3VLMoeTextSparseMoeBlock(torch.nn.Module):
    def __init__(self, config, original):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_experts = config.num_experts
        self.gate = wrap_gate(original.gate)
        self.experts = SequentialQwen3VLMoeTextExperts(config, original.experts)


class SequentialQwen3VLMoeTextExperts(torch.nn.ModuleList):
    def __init__(self, config, original):
        from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
            Qwen3VLMoeTextMLP,
        )

        self.num_experts = original.gate_up_proj.shape[0]
        with skip_weights_initialize():
            super().__init__(
                [Qwen3VLMoeTextMLP(config) for _ in range(self.num_experts)]
            )

        intermediate_size = original.down_proj.shape[1]

        for i in range(self.num_experts):
            gate_up = original.gate_up_proj[i]
            down = original.down_proj[i]

            gate_proj = gate_up[:, :intermediate_size]
            up_proj = gate_up[:, intermediate_size:]

            self[i].gate_proj.weight.data = gate_proj.t().clone().contiguous()
            self[i].up_proj.weight.data = up_proj.t().clone().contiguous()
            self[i].down_proj.weight.data = down.t().clone().contiguous()


def wrap_gate(gate):
    # temporary workaround until ct supports ignores of Linear instances
    linear_gate = torch.nn.Linear(gate.in_features, gate.out_features)
    linear_gate.weight.data.copy_(gate.weight.data)
    setattr(linear_gate, "hidden_size", gate.hidden_size)
    setattr(linear_gate, "top_k", gate.top_k)
    setattr(linear_gate, "forward", gate.forward)
    del gate
    return linear_gate


def replace(config, module, calibrate_all_experts=False):
    return LinearQwen3VLMoeTextSparseMoeBlock(
        config=config.get_text_config(),
        original=module,
    )
```
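The unfusing in `SequentialQwen3VLMoeTextExperts` assumes the fused expert tensors are laid out as `gate_up_proj: [num_experts, hidden_size, 2 * intermediate_size]` and `down_proj: [num_experts, intermediate_size, hidden_size]` (consistent with the slicing and `down_proj.shape[1]` above), so each per-expert slice must be transposed to match `nn.Linear`'s `[out_features, in_features]` weight layout. A small self-contained sanity check of that bookkeeping with toy sizes:

```python
import torch

num_experts, hidden_size, intermediate_size = 4, 8, 16
gate_up_proj = torch.randn(num_experts, hidden_size, 2 * intermediate_size)
down_proj = torch.randn(num_experts, intermediate_size, hidden_size)

gate_up = gate_up_proj[0]
gate_w = gate_up[:, :intermediate_size].t().contiguous()  # -> [intermediate, hidden]
up_w = gate_up[:, intermediate_size:].t().contiguous()    # -> [intermediate, hidden]
down_w = down_proj[0].t().contiguous()                    # -> [hidden, intermediate]

# nn.Linear(in_features, out_features).weight has shape [out_features, in_features]
assert gate_w.shape == torch.nn.Linear(hidden_size, intermediate_size).weight.shape
assert up_w.shape == torch.nn.Linear(hidden_size, intermediate_size).weight.shape
assert down_w.shape == torch.nn.Linear(intermediate_size, hidden_size).weight.shape
```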
