Skip to content

Commit 15da5c6

Browse files
committed
updatE
1 parent 27e6f07 commit 15da5c6

File tree

2 files changed

+25
-9
lines changed

2 files changed

+25
-9
lines changed

examples/quantization_w8a8_fp8/qwen3_vl_moe_fp8_block_example.py renamed to examples/quantization_w8a8_fp8/qwen3_vl_moe_fp8_example.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,29 @@
11

2-
from transformers import Qwen3VLMoeForConditionalGeneration
2+
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
33

44
from llmcompressor import oneshot
55
from llmcompressor.modeling import replace_modules_for_calibration
66
from llmcompressor.modifiers.quantization import QuantizationModifier
77

88
# NOTE: Qwen3-VL-MoE support is not in transformers<=4.56.2
9-
# you may need to install transformes from source
9+
# you may need to install transformers from source
1010

11-
MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct"
1211

12+
MODEL_ID = "Qwen/Qwen3-VL-235B-A22B-Instruct"
1313

1414
# Load model.
1515
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
16+
processor = AutoProcessor.from_pretrained(MODEL_ID)
1617
model = replace_modules_for_calibration(model)
18+
1719
# Configure the quantization algorithm and scheme.
1820
# In this case, we:
19-
# * quantize the weights to fp8 with block size 128 via ptq
20-
# * quantize the activations to fp8 with dynamic group activations
21+
# * quantize the weights to fp8 with channel-wise quantization
22+
# * quantize the activations to fp8 with dynamic token activations
23+
# NOTE: only datafree quantization is supported for Qwen3-VL-MoE currently
2124
recipe = QuantizationModifier(
2225
targets="Linear",
23-
scheme="FP8_BLOCK",
26+
scheme="FP8_DYNAMIC",
2427
ignore=[
2528
"re:.*lm_head",
2629
"re:visual.*",
@@ -33,5 +36,6 @@
3336
oneshot(model=model, recipe=recipe)
3437

3538
# Save to disk in compressed-tensors format.
36-
SAVE_DIR = "/proving-grounds/engine/hub_cache/Qwen3-VL-235B-A22B-Instruct" + "-FP8-BLOCK"
39+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-DYNAMIC"
3740
model.save_pretrained(SAVE_DIR)
41+
processor.save_pretrained(SAVE_DIR)

src/llmcompressor/modeling/qwen3_vl_moe.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def __init__(self, config, original):
1111
super().__init__()
1212
self.hidden_size = config.hidden_size
1313
self.num_experts = config.num_experts
14-
self.gate = original.gate
14+
self.gate = wrap_gate(original.gate)
1515
self.experts = SequentialQwen3VLMoeTextExperts(config, original.experts)
1616

1717
class SequentialQwen3VLMoeTextExperts(torch.nn.ModuleList):
@@ -33,7 +33,19 @@ def __init__(self, config, original):
3333
self[i].up_proj.weight.data = up_proj.t().clone().contiguous()
3434
self[i].down_proj.weight.data = down.t().clone().contiguous()
3535

36-
def replace(config, module, calibrate_all_experts):
36+
37+
def wrap_gate(gate):
    """Swap an MoE router gate module for an equivalent ``torch.nn.Linear``.

    Temporary workaround until compressed-tensors supports ignoring Linear
    *instances* (rather than classes): exposing the gate as a plain Linear
    lets the quantization recipe match / ignore it by type.

    The returned module delegates ``forward`` to the original gate, so
    runtime behavior is unchanged; only the module's class (and therefore
    how it is matched during quantization and serialization) differs.

    :param gate: original gate module; must expose ``in_features``,
        ``out_features``, ``weight``, ``hidden_size`` and ``top_k``
    :return: ``torch.nn.Linear`` carrying the gate's weight and attributes
    """
    # Match the source weight's dtype/device so the copied parameter is not
    # silently cast to fp32 on CPU when the model was loaded in bf16 / on GPU.
    # bias=False: only the weight is copied from the gate, so a bias parameter
    # would be left randomly initialized and then written into the saved
    # checkpoint (Qwen3 MoE router gates appear biasless — TODO confirm).
    linear_gate = torch.nn.Linear(
        gate.in_features,
        gate.out_features,
        bias=False,
        dtype=gate.weight.dtype,
        device=gate.weight.device,
    )
    linear_gate.weight.data.copy_(gate.weight.data)
    # Preserve attributes the surrounding MoE block reads off the gate.
    linear_gate.hidden_size = gate.hidden_size
    linear_gate.top_k = gate.top_k
    # Delegate to the original forward so numerics are identical.
    # NOTE(review): the bound method keeps `gate` alive, so the `del` below
    # only drops the local name — it does not free the original module.
    linear_gate.forward = gate.forward
    del gate
    return linear_gate
46+
47+
48+
def replace(config, module, calibrate_all_experts=False):
3749
return LinearQwen3VLMoeTextSparseMoeBlock(
3850
config=config.get_text_config(),
3951
original=module,

0 commit comments

Comments
 (0)