Skip to content

Commit 31e585c

Browse files
yiliu30 and dsikka authored
[Autoround] Add MXFP8 example for Qwen3-8B (#2534)
## Summary - Adds an experimental example script demonstrating MXFP8 quantization of Qwen3-8B using `AutoRoundModifier` - Includes sample generation verification and saving in compressed format ```bash ========== SAMPLE GENERATION ============== Hello my name is Mandy and I am 25 years old. I live in a small village called Tynedale in the north of England. I have a small family, my parents and my younger brother. I work as a teacher in a local school. I love my job because I enjoy working with children. I have a dog called Lucy. She is a golden retriever and she is very friendly and smart. I also have a cat called Charlie. He is a black cat and he is very ========================================== ``` cc @hshen14 @thuang6 --------- Signed-off-by: yiliu30 <yi4.liu@intel.com> Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
1 parent db18422 commit 31e585c

File tree

2 files changed

+61
-2
lines changed

2 files changed

+61
-2
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
from pathlib import Path

from auto_round.calib_dataset import get_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from compressed_tensors.offload import dispatch_model

# Load the target model and its tokenizer.
model_id = "Qwen/Qwen3-8B"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration settings.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048

# Build the AutoRound-aligned calibration dataset.
ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)

# Quantization recipe: W8A8-MXFP8 on all Linear layers via AutoRound,
# keeping the lm_head in full precision.
recipe = AutoRoundModifier(
    targets="Linear",
    scheme="MXFP8",
    ignore=["lm_head"],
    iters=200,
)

# Run the one-shot quantization pass.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Sanity-check the quantized model with a short generation.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {name: tensor.to(model.device) for name, tensor in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Persist the quantized model and tokenizer in compressed format.
SAVE_DIR = Path(model_id).name + "-W8A8-MXFP8-AutoRound"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

src/llmcompressor/modifiers/autoround/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -440,8 +440,8 @@ def _postprocess_qparams(self, model, llmc_registered_qparams):
440440
ar_value = getattr(module, ar_param_name)
441441
if ar_value is None:
442442
continue
443-
if self.scheme == "MXFP4" and ar_param_name == "scale":
444-
# Convert log2 scale back to normal scale for MXFP4
443+
if self.scheme in ("MXFP4", "MXFP8") and ar_param_name == "scale":
444+
# Convert log2 scale back to normal scale for MXFP4 and MXFP8
445445
ar_value = torch.exp2(ar_value.float())
446446
if not isinstance(ar_value, torch.Tensor):
447447
ar_value = torch.tensor(ar_value)

0 commit comments

Comments
 (0)