Skip to content

Commit 2cb141e

Browse files
authored
Merge branch 'main' into qwen3VLMoE_lineared
2 parents 4995535 + fae9429 commit 2cb141e

File tree

10 files changed

+405
-25
lines changed

10 files changed

+405
-25
lines changed

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,13 @@ quality:
2828
ruff format --check $(CHECKDIRS);
2929

3030
# style the code according to accepted standards for the repo
31+
# Note: We run `ruff format` twice. Once to fix long lines before lint check
32+
# and again to fix any formatting issues introduced by ruff check --fix
3133
style:
3234
@echo "Running python styling";
35+
ruff format $(CHECKDIRS);
3336
ruff check --fix $(CHECKDIRS);
34-
ruff format $(CHECKDIRS);
37+
ruff format --silent $(CHECKDIRS);
3538

3639
# run tests for the repo
3740
test:
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from datasets import load_dataset
2+
from transformers import AutoModelForCausalLM, AutoTokenizer
3+
4+
from llmcompressor import oneshot
5+
from llmcompressor.modifiers.quantization import QuantizationModifier
6+
from llmcompressor.utils import dispatch_for_generation
7+
8+
# NOTE: Qwen3-Next-80B-A3B-Instruct support is not in transformers<=4.56.2
9+
# you may need to install transformers from source
10+
11+
MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
12+
13+
# Load model.
14+
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
15+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
16+
17+
18+
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
19+
DATASET_SPLIT = "train_sft"
20+
21+
# Select number of samples
22+
NUM_CALIBRATION_SAMPLES = 20
23+
MAX_SEQUENCE_LENGTH = 2048
24+
25+
# Load dataset and preprocess.
26+
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
27+
ds = ds.shuffle(seed=42)
28+
29+
30+
def preprocess(example):
31+
return {
32+
"text": tokenizer.apply_chat_template(
33+
example["messages"],
34+
tokenize=False,
35+
)
36+
}
37+
38+
39+
ds = ds.map(preprocess)
40+
41+
42+
# Tokenize inputs.
43+
def tokenize(sample):
44+
return tokenizer(
45+
sample["text"],
46+
padding=False,
47+
max_length=MAX_SEQUENCE_LENGTH,
48+
truncation=True,
49+
add_special_tokens=False,
50+
)
51+
52+
53+
ds = ds.map(tokenize, remove_columns=ds.column_names)
54+
55+
# Configure the quantization algorithm and scheme.
56+
# In this case, we:
57+
# * quantize the weights to fp4 with per group 16 via ptq
58+
# * calibrate a global_scale for activations, which will be used to
59+
# quantize activations to fp4 on the fly
60+
recipe = QuantizationModifier(
61+
targets="Linear",
62+
scheme="NVFP4",
63+
ignore=[
64+
"lm_head",
65+
"re:.*mlp.gate$",
66+
"re:.*mlp.shared_expert_gate$",
67+
"re:.*linear_attn.*",
68+
],
69+
)
70+
71+
# Apply quantization.
72+
# We set `calibrate_moe_context` to True to update all `Qwen3MoeSparseMoeBlock`
73+
# during calibration.
74+
# Feel free to update the definition under
75+
# llm-compressor/src/llmcompressor/modeling/qwen3_moe.py` to play around with
76+
# this behaviour and evaluate its impact on quantization performance
77+
oneshot(
78+
model=model,
79+
dataset=ds,
80+
recipe=recipe,
81+
max_seq_length=MAX_SEQUENCE_LENGTH,
82+
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
83+
calibrate_moe_context=True,
84+
)
85+
86+
87+
print("\n\n")
88+
print("========== SAMPLE GENERATION ==============")
89+
dispatch_for_generation(model)
90+
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
91+
model.device
92+
)
93+
output = model.generate(input_ids, max_new_tokens=100)
94+
print(tokenizer.decode(output[0]))
95+
print("==========================================\n\n")
96+
97+
98+
# Save to disk in compressed-tensors format.
99+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
100+
model.save_pretrained(SAVE_DIR, save_compressed=True)
101+
tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from transformers import AutoModelForCausalLM, AutoTokenizer
2+
3+
from llmcompressor import oneshot
4+
from llmcompressor.modifiers.quantization import QuantizationModifier
5+
from llmcompressor.utils import dispatch_for_generation
6+
7+
MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"
8+
9+
# Load model.
10+
model = AutoModelForCausalLM.from_pretrained(
11+
MODEL_ID,
12+
torch_dtype="auto",
13+
low_cpu_mem_usage=True,
14+
trust_remote_code=True,
15+
)
16+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
17+
18+
recipe = QuantizationModifier(
19+
targets=["Linear"],
20+
scheme="FP8_DYNAMIC",
21+
ignore=[
22+
"lm_head",
23+
"re:.*mlp.gate$",
24+
"re:.*mlp.shared_expert_gate$",
25+
"re:.*linear_attn.*",
26+
],
27+
)
28+
29+
# Apply quantization.
30+
oneshot(model=model, recipe=recipe)
31+
32+
# Confirm generations of the quantized model look sane.
33+
print("========== SAMPLE GENERATION ==============")
34+
dispatch_for_generation(model)
35+
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
36+
model.device
37+
)
38+
output = model.generate(input_ids, max_new_tokens=20)
39+
print(tokenizer.decode(output[0]))
40+
print("==========================================")
41+
42+
# Save to disk in compressed-tensors format.
43+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
44+
model.save_pretrained(SAVE_DIR, save_compressed=True)
45+
tokenizer.save_pretrained(SAVE_DIR)

examples/quantizing_moe/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Quantizing Mixtral-8x7B-Instruct-v0.1 Model with FP8
22

3-
This directory contains an example script for quantizing the `Mixtral-8x7B-Instruct-v0.1` model using the static per-tensor FP8 quantization scheme.
3+
This directory contains example scripts for quantizing LLMs using the static per-tensor FP8 quantization scheme.
44

55
## Installation
66

@@ -32,7 +32,7 @@ python mixtral_example.py
3232

3333
### Step 1: Select a Model, Dataset, and Recipe
3434

35-
In this step, you'll choose a baseline model for quantization, a dataset for calibration, and a quantization recipe.
35+
In this step, you'll choose a base model for quantization, a dataset for calibration, and a quantization recipe.
3636

3737
- **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub.
3838
- **Datasets**: Can also be from a local directory or the Hugging Face Hub.

examples/transform/README.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Applying Transforms to Improve Quantization Accuracy
2+
3+
This directory contains example scripts for applying transforms to models for the purpose of improving quantization accuracy. For more information on transforms, see [QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs](https://arxiv.org/abs/2404.00456). The two transform styles currently supported are SpinQuant/QuaRot-style (`SpinQuantModifier`), and QuIP-style (`QuIPModifier`).
4+
5+
See also [[vLLM Office Hours #31] vLLM and LLM Compressor Update - August 28, 2025](https://www.youtube.com/watch?v=WVenRmF4dPY&list=PLbMP1JcGBmSHxp4-lubU5WYmJ9YgAQcf3&index=3).
6+
7+
## Installation
8+
9+
To get started, install the necessary dependencies by executing the following commands:
10+
11+
```bash
12+
git clone https://github.com/vllm-project/llm-compressor.git
13+
cd llm-compressor
14+
pip install -e .
15+
```
16+
17+
## Quickstart
18+
19+
The provided example script demonstrates the process for applying QuIP-style transforms before quantization.
20+
21+
```bash
22+
python3 quip_example.py
23+
```
24+
25+
### Step 1: Select a Model, Dataset, and Recipe
26+
27+
In this step, you'll choose a base model for quantization and a transformation + quantization recipe.
28+
29+
- **Models**: Can be referenced from a local directory or retrieved from the Hugging Face Hub.
30+
- **Recipes**: These are YAML files or Python modifier objects that describe how a model should be optimized during or after training. In this example, we use the `QuIPModifier` applied before the `QuantizationModifier` with the scheme set to `FP8`.
31+
32+
```python
33+
from llmcompressor.modifiers.transform import QuIPModifier
34+
from llmcompressor.modifiers.quantization import QuantizationModifier
35+
36+
recipe = [
37+
QuIPModifier(
38+
rotations=["v", "u"], transform_block_size=128, transform_type="hadamard"
39+
),
40+
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
41+
]
42+
```
43+
44+
Note that `QuIPModifier` can be customized. For a full list of the available arguments, see the [docstring](/src/llmcompressor/modifiers/transform/spinquant/base.py) or documentation.
45+
46+
* `rotations` determines which of the input rotations (v) or output rotations (u) should be used.
47+
* `transform_block_size` determines the size of the hadamard. Smaller hadamards require less cost at runtime.
48+
* `transform_type` determines how the transform is constructed. `hadamard` uses the Sylvester construction.
49+
50+
### Step 2: Run Quantization Using Oneshot
51+
52+
The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be quantized and saved to `Llama-3.1-8B-Instruct-quip-w4a16`. We use the "datafree" pipeline, since our recipe does not require calibration data.
53+
54+
```python
55+
from llmcompressor import oneshot
56+
57+
# Apply algorithms.
58+
oneshot(model=model, recipe=recipe, pipeline="datafree")
59+
60+
# Save to disk compressed.
61+
SAVE_DIR = MODEL_ID.split("/")[1] + "-quip-w4a16"
62+
model.save_pretrained(SAVE_DIR, save_compressed=True)
63+
tokenizer.save_pretrained(SAVE_DIR)
64+
```
65+
66+
### Step 3: Run optimized model in vLLM
67+
Models optimized with the `hadamard` transform type will be able to leverage the hadacore kernels for accelerated inference. Use the [benchmarks/latency.py](https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/latency.py) script to benchmark latency
68+
69+
```bash
70+
python3 benchmarks/benchmark_latency.py --model path/to/Llama-3.2-1B-Instruct-quip-w4a16
71+
```
72+
73+
74+
#### Dense Model Latency (sec) ####
75+
| [Base](https://huggingface.co/meta-llama/Llama-3.2-1B-instruct) | Hadacore | GEMM |
76+
| - | - | - |
77+
| 0.4710 | 0.4948 | 1.3946 |
78+
79+
#### Quantized Model Latency (sec) ####
80+
| Base W4A16 | Hadacore | GEMM |
81+
| - | - | - |
82+
| 0.4402 | 0.4489 | 1.2917 |

examples/transform/quip_example.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@
2020
# Configure the quantization algorithm to run.
2121
# * apply quip transforms to model in order to make quantization easier
2222
# * quantize the weights to 4 bit with a group size 128
23+
# * NOTE: if a model has activation shapes not divisible by 2^N, consider using
24+
# `random-hadamard` (random hadamard kernels will be added in the future)
2325
recipe = [
2426
QuIPModifier(
25-
rotations=["v", "u"], transform_block_size=128, transform_type="random-hadamard"
27+
rotations=["v", "u"], transform_block_size=128, transform_type="hadamard"
2628
),
2729
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
2830
]

examples/transform/spinquant_example.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
# * quantize the weights to 4 bit with group size 128
1919
recipe = [
2020
SpinQuantModifier(
21-
rotations=["R1", "R2", "R4"], transform_block_size=64, transform_type="hadamard"
21+
rotations=["R1", "R2", "R4"],
22+
transform_block_size=128,
23+
transform_type="hadamard",
2224
),
2325
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
2426
]

src/llmcompressor/args/dataset_arguments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ class DatasetArguments(CustomDatasetArguments):
201201
"_prepare_4d_causal_attention_mask",
202202
"_prepare_fsmt_decoder_inputs",
203203
"_prepare_4d_causal_attention_mask_with_cache_position",
204+
"_update_linear_attn_mask",
204205
],
205206
metadata={
206207
"help": "List of functions to ignore during tracing, either "

src/llmcompressor/modeling/prepare.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99

1010
try:
1111
from llmcompressor.modeling.qwen3_vl_moe import replace as replace_Qwen3VLMoE
12+
from llmcompressor.modeling.qwen3_next_moe import replace as replace_Qwen3NextMoE
1213
except ImportError:
1314
logger.warning(
14-
"Qwen3-VL-MoE support is not available. "
15+
"Qwen3-VL-MoE and Qwen3-Next support is not available. "
1516
"Please ensure that you have the correct version of transformers installed."
1617
)
1718
replace_Qwen3VLMoE = None
18-
19+
replace_Qwen3NextMoE = None
20+
1921
from llmcompressor.utils.helpers import patch_attr
2022

2123
__all__ = ["replace_modules_for_calibration"]
@@ -50,28 +52,51 @@ def replace_modules_for_calibration(
5052
# ------------------- module replacements; during calibration --------------------
5153

5254

53-
def update_qwen3_moe(model, stack, calibrate_all_experts):
54-
for module in model.modules():
55-
cls_name = module.__class__.__name__
56-
if cls_name == "Qwen3MoeDecoderLayer":
57-
# Optionally update the model.config to pass in other arguments
58-
stack.enter_context(
59-
patch_attr(
60-
module,
61-
"mlp",
62-
replace_Qwen3MoE(
63-
config=model.config,
64-
module=module.mlp,
65-
calibrate_all_experts=calibrate_all_experts,
66-
),
67-
)
55+
def update_qwen3_moe(model, module, stack, calibrate_all_experts):
56+
cls_name = module.__class__.__name__
57+
if (
58+
cls_name == "Qwen3MoeDecoderLayer"
59+
and module.mlp.__class__.__name__ == "Qwen3MoeSparseMoeBlock"
60+
):
61+
stack.enter_context(
62+
patch_attr(
63+
module,
64+
"mlp",
65+
replace_Qwen3MoE(
66+
config=model.config,
67+
module=module.mlp,
68+
calibrate_all_experts=calibrate_all_experts,
69+
),
6870
)
71+
)
72+
73+
74+
def update_qwen3_next_moe(model, module, stack, calibrate_all_experts):
75+
cls_name = module.__class__.__name__
76+
if (
77+
cls_name == "Qwen3NextDecoderLayer"
78+
and module.mlp.__class__.__name__ == "Qwen3NextSparseMoeBlock"
79+
):
80+
stack.enter_context(
81+
patch_attr(
82+
module,
83+
"mlp",
84+
replace_Qwen3NextMoE(
85+
config=model.config,
86+
module=module.mlp,
87+
calibrate_all_experts=calibrate_all_experts,
88+
),
89+
)
90+
)
6991

7092

7193
moe_context = {
7294
"Qwen3MoeForCausalLM": update_qwen3_moe,
7395
}
7496

97+
if replace_Qwen3NextMoE is not None:
98+
moe_context["Qwen3NextForCausalLM"] = update_qwen3_next_moe
99+
75100

76101
def moe_calibration_context(
77102
model: PreTrainedModel,
@@ -80,6 +105,7 @@ def moe_calibration_context(
80105
):
81106
# Temporarily updates the MoE modules within the context
82107
# Once the context exists, parameter updates persist
83-
cls_name = model.__class__.__name__
84-
if cls_name in moe_context:
85-
moe_context.get(cls_name)(model, stack, calibrate_all_experts)
108+
model_name = model.__class__.__name__
109+
if model_name in moe_context:
110+
for module in model.modules():
111+
moe_context[model_name](model, module, stack, calibrate_all_experts)

0 commit comments

Comments
 (0)