Commit e97c50a

dsikka and kylesayrs authored
[MTP] Add MTP Layers to final checkpoint (#2486)
SUMMARY:
- Update examples to save mtp layers; requires: vllm-project/compressed-tensors#640
- Fix how the MoE example handles the processor
- Update repo readme with Qwen 3.5 details

Signed-off-by: Dipika Sikka <ds3822@columbia.edu>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent ed2d7f6 commit e97c50a

7 files changed: 60 additions & 39 deletions

README.md

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ Big updates have landed in LLM Compressor! To get a more in-depth look, check ou
 
 Some of the exciting new features include:
 
+* **Qwen3.5 Support**: Qwen 3.5 can now be quantized using LLM Compressor. You will need to update your local transformers version using `uv pip install --upgrade transformers` and install LLM Compressor from source if using `<0.11`. Once updated, you should be able to run examples for the [MoE](examples/quantization_w4a4_fp4/qwen3_5_example.py) and [non-MoE](examples/quantization_w4a4_fp4/qwen3_5_example.py) variants of Qwen 3.5 end-to-end. For models quantized and published by the RedHat team, consider using the [NVFP4](https://huggingface.co/RedHatAI/Qwen3.5-122B-A10B-NVFP4) and FP8 checkpoints for [Qwen3.5-122B](https://huggingface.co/RedHatAI/Qwen3.5-122B-A10B-FP8-dynamic) and [Qwen3.5-397B](https://huggingface.co/RedHatAI/Qwen3.5-397B-A17B-FP8-dynamic).
 * **Updated offloading and model loading support**: Loading transformers models that are offloaded to disk and/or offloaded across distributed process ranks is now supported. Disk offloading allows users to load and compress very large models which normally would not fit in CPU memory. Offloading functionality is no longer supported through accelerate but through model loading utilities added to compressed-tensors. For a full summary of updated loading and offloading functionality, for both single-process and distributed flows, see the [Big Models and Distributed Support guide](docs/guides/big_models_and_distributed/model_loading.md).
 * **Distributed GPTQ Support**: GPTQ now supports Distributed Data Parallel (DDP) functionality to significantly improve calibration runtime. An example using DDP with GPTQ can be found [here](examples/quantization_w4a16/llama3_ddp_example.py).
 * **Updated FP4 Microscale Support**: GPTQ now supports FP4 quantization schemes, including both [MXFP4](examples/quantization_w4a16_fp4/mxfp4/llama3_example.py) and [NVFP4](examples/quantization_w4a4_fp4/llama3_gptq_example.py). MXFP4 support has also been improved with updated weight scale generation. Models with weight-only quantization in the MXFP4 format can now run in vLLM as of vLLM v0.14.0. MXFP4 models with activation quantization are not yet supported in vLLM for compressed-tensors models.

examples/quantization_w4a16_fp4/mxfp4/qwen3.5_example.py

Lines changed: 10 additions & 3 deletions
@@ -1,4 +1,5 @@
 from compressed_tensors.offload import dispatch_model
+from compressed_tensors.utils import save_mtp_tensors_to_checkpoint
 from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
 
 from llmcompressor import oneshot
@@ -14,16 +15,18 @@
 # Configure the quantization algorithm and scheme.
 # In this case, we:
 # * quantize the weights to fp4 with per group 32 via ptq
-# * skip the visual encoder, lm_head, linear attention (Gated DeltaNet
-#   fused projections are incompatible with microscale formats), and MTP modules
+# * skip the visual encoder, lm_head, and linear attention
+#   (Gated DeltaNet fused projections are incompatible with microscale formats)
+
+# No need to include mtp layers as they are not loaded
+# through Qwen3_5ForConditionalGeneration
 recipe = QuantizationModifier(
     targets="Linear",
     scheme="MXFP4A16",
     ignore=[
         "lm_head",
         "re:.*visual.*",
         "re:.*linear_attn.*",
-        "re:.*mtp.*",
     ],
 )
 
@@ -45,3 +48,7 @@
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-MXFP4A16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
+
+# MTP layers are excluded from the model through Qwen3_5ForConditionalGeneration
+# Save them as-is from the original checkpoint into the quantized output.
+save_mtp_tensors_to_checkpoint(source_model=MODEL_ID, dest_dir=SAVE_DIR)
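A side note on the save path used above: `SAVE_DIR` is derived from the model ID with plain string operations. A minimal standalone sketch of that derivation (the model ID below is a placeholder, not one used in the examples):

```python
# Derive the local output directory from a Hugging Face model ID,
# mirroring the SAVE_DIR expression in the example above.
# "org/Qwen3.5-test" is a made-up ID used only for illustration.
MODEL_ID = "org/Qwen3.5-test"

# rstrip("/") tolerates a trailing slash; split("/")[-1] keeps only the
# repo name, dropping the org prefix.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-MXFP4A16"
print(SAVE_DIR)  # → Qwen3.5-test-MXFP4A16
```

The same pattern appears in every example, with only the scheme suffix changing.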

examples/quantization_w4a16_fp4/nvfp4/qwen3.5_example.py

Lines changed: 10 additions & 3 deletions
@@ -1,4 +1,5 @@
 from compressed_tensors.offload import dispatch_model
+from compressed_tensors.utils import save_mtp_tensors_to_checkpoint
 from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration
 
 from llmcompressor import oneshot
@@ -14,16 +15,18 @@
 # Configure the quantization algorithm and scheme.
 # In this case, we:
 # * quantize the weights to fp4 with per group 16 via ptq
-# * skip the visual encoder, lm_head, linear attention (Gated DeltaNet
-#   fused projections are incompatible with NVFP4), and MTP modules
+# * skip the visual encoder, lm_head, linear attention
+#   (Gated DeltaNet fused projections are incompatible with microscale formats)
+
+# No need to include mtp layers as they are not loaded
+# through Qwen3_5ForConditionalGeneration
 recipe = QuantizationModifier(
     targets="Linear",
     scheme="NVFP4A16",
     ignore=[
         "lm_head",
         "re:.*visual.*",
         "re:.*linear_attn.*",
-        "re:.*mtp.*",
     ],
 )
 
@@ -45,3 +48,7 @@
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
+
+# MTP layers are excluded from the model through Qwen3_5ForConditionalGeneration
+# Save them as-is from the original checkpoint into the quantized output.
+save_mtp_tensors_to_checkpoint(source_model=MODEL_ID, dest_dir=SAVE_DIR)
Lines changed: 33 additions & 30 deletions
@@ -1,5 +1,7 @@
+import torch
+from compressed_tensors.utils import save_mtp_tensors_to_checkpoint
 from datasets import load_dataset
-from transformers import AutoTokenizer, Qwen3_5MoeForConditionalGeneration
+from transformers import AutoProcessor, Qwen3_5MoeForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
@@ -10,9 +12,10 @@
 
 # Load model.
 model = Qwen3_5MoeForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
-processor = AutoTokenizer.from_pretrained(MODEL_ID)
-
+processor = AutoProcessor.from_pretrained(MODEL_ID)
 
+# No need to include mtp layers as they are not loaded
+# through Qwen3_5MoeForConditionalGeneration
 recipe = QuantizationModifier(
     targets="Linear",
     scheme="NVFP4",
@@ -30,44 +33,39 @@
 NUM_CALIBRATION_SAMPLES = 256
 MAX_SEQUENCE_LENGTH = 4096
 
-# Load datasets and preprocess.
-samples_per_dataset = NUM_CALIBRATION_SAMPLES
-
-ds_ultrachat = load_dataset(
+ds = load_dataset(
     "HuggingFaceH4/ultrachat_200k",
-    split=f"train_sft[:{samples_per_dataset}]",
+    split=f"train_sft[:{NUM_CALIBRATION_SAMPLES}]",
 )
-
-# Both datasets share a "messages" column with the same chat format.
-# Keep only that column so we can concatenate them.
-ds = ds_ultrachat.select_columns(["messages"])
+ds = ds.select_columns(["messages"])
 ds = ds.shuffle(seed=42)
 
 
-def preprocess(example):
-    return {
-        "text": processor.apply_chat_template(
-            example["messages"],
-            tokenize=False,
-        )
-    }
-
-
-ds = ds.map(preprocess)
-
-
-# Tokenize inputs.
-def tokenize(sample):
-    return processor(
-        sample["text"],
+def preprocess_function(example):
+    messages = [
+        {"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
+        for m in example["messages"]
+    ]
+    return processor.apply_chat_template(
+        messages,
+        return_tensors="pt",
         padding=False,
-        max_length=MAX_SEQUENCE_LENGTH,
         truncation=True,
+        max_length=MAX_SEQUENCE_LENGTH,
+        tokenize=True,
         add_special_tokens=False,
+        return_dict=True,
+        add_generation_prompt=False,
     )
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
+
+
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
 
 # Apply quantization.
 oneshot(
@@ -77,9 +75,14 @@ def tokenize(sample):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     moe_calibrate_all_experts=True,
+    data_collator=data_collator,
 )
 
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR)
 processor.save_pretrained(SAVE_DIR)
+
+# MTP layers are excluded from the model through Qwen3_5MoeForConditionalGeneration
+# Save them as-is from the original checkpoint into the quantized output.
+save_mtp_tensors_to_checkpoint(source_model=MODEL_ID, dest_dir=SAVE_DIR)
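The `preprocess_function` in the diff above first rewraps each plain-string chat message into the structured content layout that multimodal chat templates expect. That rewrapping is plain Python and can be sketched in isolation (the sample conversation below is made up for illustration):

```python
# One ultrachat-style sample: "content" is a plain string per message.
example = {"messages": [
    {"role": "user", "content": "What does NVFP4 quantization do?"},
    {"role": "assistant", "content": "It compresses weights to 4-bit floats."},
]}

# Rewrap each string into the list-of-parts layout used by multimodal
# processors: content becomes [{"type": "text", "text": ...}].
messages = [
    {"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
    for m in example["messages"]
]

print(messages[0]["content"][0]["type"])  # → text
```

Tokenization itself still happens inside `apply_chat_template`; only the message shape is adjusted here.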

src/llmcompressor/entrypoints/model_free/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -7,11 +7,13 @@
 from compressed_tensors.entrypoints.convert import (
     Converter,
     exec_jobs,
+)
+from compressed_tensors.quantization import QuantizationScheme
+from compressed_tensors.utils.safetensors_load import (
     get_checkpoint_files,
     is_weights_file,
     update_safetensors_index,
 )
-from compressed_tensors.quantization import QuantizationScheme
 from loguru import logger
 
 from llmcompressor.entrypoints.model_free.helpers import gpu_if_available

src/llmcompressor/entrypoints/model_free/reindex_fused_weights.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 
 import torch
 import tqdm
-from compressed_tensors.entrypoints.convert import (
+from compressed_tensors.utils.safetensors_load import (
     get_checkpoint_files,
     is_weights_file,
     update_safetensors_index,

src/llmcompressor/entrypoints/model_free/save_utils.py

Lines changed: 2 additions & 1 deletion
@@ -9,12 +9,13 @@
     TRANSFORM_CONFIG_NAME,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.entrypoints.convert import Converter, find_config_path
+from compressed_tensors.entrypoints.convert import Converter
 from compressed_tensors.quantization import (
     QuantizationConfig,
     QuantizationScheme,
     QuantizationStatus,
 )
+from compressed_tensors.utils.safetensors_load import find_config_path
 from loguru import logger
 from pydantic import ValidationError
 