16 changes: 10 additions & 6 deletions examples/multimodal_vision/llama4_example.py
@@ -3,18 +3,22 @@
from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
# This change allows compatibility with vllm.
# To apply your own custom module for experimentation, consider updating
# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
model = replace_modules_for_calibration(model)
# MoE calibration is now handled automatically by the pipeline.
# The `SequentialLlama4TextMoe` modules will be applied during calibration
# to enable proper expert calibration and vLLM compatibility.
#
# NOTE: This restructuring is specifically required for vLLM compatibility.
# To customize the calibration behavior, implement your own module in
# modeling/llama4.py (e.g., based on `SequentialLlama4TextMoe`), then update
# `MOE_EXPERTS_REPLACEMENT` in prepare.py to reference your custom module.

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 512
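To place the updated comment in context, here is a minimal sketch of how the rest of this example proceeds under the new behavior. It assumes `model`, `processor`, `DATASET_ID`, and `NUM_CALIBRATION_SAMPLES` from the code above; the GPTQ ignore patterns, sequence length, and the omitted dataset preprocessing/data collator are illustrative assumptions, not the exact contents of the file.

# No `replace_modules_for_calibration` call is needed anymore: the pipeline
# swaps in `SequentialLlama4TextMoe` for the duration of calibration.
recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["re:.*lm_head", "re:vision_model.*", "re:multi_modal_projector.*"],  # illustrative
)

oneshot(
    model=model,
    dataset=DATASET_ID,
    recipe=recipe,
    max_seq_length=8192,  # illustrative
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)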
8 changes: 4 additions & 4 deletions examples/quantization_w4a4_fp4/README.md
@@ -84,11 +84,11 @@ We have successfully created an `nvfp4` model!

# Quantizing MoEs

To quantize MoEs, a few additional steps are required. An example quantizing Llama4 can be found under `llama4_example.py`. Here, we replace all `Llama4TextMoe` modules by calling `replace_modules_for_calibration`. This replacement allows us to:
When quantizing MoEs, calibration is now handled automatically by the pipeline. An example quantizing Llama4 can be found under `llama4_example.py`. During calibration, the pipeline applies the appropriate MoE calibration context, which:

1. Linearize the model to enable quantization and execution in vLLM. This is required as the native model definition does not include `torch.nn.Linear` layers in its MoE blocks, a requirement for LLM Compressor to run quantization.
2. Ensure experts are quantized correctly as not all experts are activated during calibration
1. Linearizes the model to enable quantization and execution in vLLM. This is required because the native model definition does not include `torch.nn.Linear` layers in its MoE blocks, which LLM Compressor needs in order to run quantization.
2. Ensures experts are quantized correctly, since not all experts are activated during calibration.

Similarly, an example quantizing the Qwen3-30B-A3B model can be found under `qwen_30b_a3b.py`. This model does not require additional linearization as required by the Llama4 model. However, similar to Llama4, in order to ensure the experts are quantized correctly, we can pass in `calibrate_moe_context` which temporarily updates the model definition to use `Qwen3MoeSparseMoeBlock` which updates how the forward pass is handled in the MoE block during calibration. Feel free to update the definition under `llm-compressor/src/llmcompressor/modeling/qwen3_moe.py` to play around with this behavior and evaluate its impact on quantization performance.
Similarly, an example quantizing the Qwen3-30B-A3B model can be found under `qwen_30b_a3b.py`. This model uses contextual MoE calibration, which temporarily updates the model definition to use `Qwen3MoeSparseMoeBlock` and changes how the forward pass is handled in the MoE block during calibration. Feel free to update the definition under `llm-compressor/src/llmcompressor/modeling/qwen3_moe.py` to experiment with this behavior and evaluate its impact on quantization performance.
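For orientation, a minimal sketch of the Qwen3-30B-A3B flow once MoE calibration is automatic; the recipe, dataset, and calibration settings are illustrative placeholders rather than the exact values used in `qwen_30b_a3b.py`.

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-30B-A3B"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# No `calibrate_moe_context` flag: the pipeline temporarily swaps in
# `Qwen3MoeSparseMoeBlock` for the duration of calibration on its own.
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4", ignore=["lm_head"])

oneshot(
    model=model,
    dataset="open_platypus",      # placeholder calibration dataset
    recipe=recipe,
    max_seq_length=2048,          # illustrative
    num_calibration_samples=512,  # illustrative
)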


9 changes: 3 additions & 6 deletions examples/quantization_w4a4_fp4/llama4_example.py
@@ -3,18 +3,15 @@
from transformers import Llama4ForConditionalGeneration, Llama4Processor

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
processor = Llama4Processor.from_pretrained(model_id)
# We update `Llama4TextMoe` modules with custom `SequentialLlama4TextMoe`.
# This change allows compatibility with vllm.
# To apply your own custom module for experimentation, consider updating
# `SequentialLlama4TextMoe` under llmcompressor/modeling/llama4.py
model = replace_modules_for_calibration(model)
# MoE calibration is now handled automatically by the pipeline.
# The `SequentialLlama4TextMoe` modules will be applied during calibration
# to enable proper expert calibration and vLLM compatibility.

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 20
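After `oneshot` completes, the compressed checkpoint is typically saved so vLLM can load it directly. A short sketch assuming `model` and `processor` from the example above; the directory name is illustrative, and `save_compressed=True` is assumed to be llmcompressor's wrapper flag for writing a compressed-tensors checkpoint.

SAVE_DIR = "Llama-4-Scout-17B-16E-Instruct-NVFP4"  # illustrative name
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)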
5 changes: 3 additions & 2 deletions examples/quantizing_moe/deepseek_r1_example.py
@@ -2,7 +2,6 @@
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.quantization import GPTQModifier

# Select model and load it.
@@ -20,7 +19,9 @@
model_id, torch_dtype="auto", config=config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = replace_modules_for_calibration(model)
# MoE calibration is now handled automatically by the pipeline.
# The `DeepseekV3MoECalibrate` modules will be applied during calibration
# to enable proper expert calibration.

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
35 changes: 25 additions & 10 deletions src/llmcompressor/args/dataset_arguments.py
@@ -7,6 +7,7 @@
HuggingFace datasets, custom JSON/CSV files, and DVC-managed datasets.
"""

import warnings
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Union

@@ -126,16 +127,6 @@ class DatasetArguments(CustomDatasetArguments):
default=512,
metadata={"help": "Number of samples to use for one-shot calibration"},
)
calibrate_moe_context: bool = field(
Collaborator review comment: Can we add a dataset argument called `moe_calibrate_all_experts` which defaults to True? (A hypothetical sketch of this suggestion follows the deleted field below.)
default=False,
metadata={
"help": "If during calibration, the MoE context should be enabled "
"for the given model. This usually involves updating all MoE modules "
"in the model for the duration of calibration. See moe_context under "
"modeling/prepare.py for a list of supported MoEs and their updated "
"module definitions"
},
)
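    # Hypothetical sketch (not part of this diff) of the reviewer's suggested
    # `moe_calibrate_all_experts` argument defaulting to True; the field name
    # and help text are assumptions, not merged code.
    moe_calibrate_all_experts: bool = field(
        default=True,
        metadata={
            "help": "If True, route calibration samples through all experts "
            "during MoE calibration, not only the experts selected by the "
            "router, so that every expert receives calibration statistics."
        },
    )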
shuffle_calibration_samples: Optional[bool] = field(
default=True,
metadata={
@@ -181,6 +172,17 @@
),
},
)
    calibrate_moe_context: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "DEPRECATED: This parameter is deprecated and will be "
                "removed in a future version. MoE calibration context is "
                "now handled automatically by the pipeline. This parameter "
                "is ignored and will not affect the calibration process."
            ),
        },
    )
# --- pipeline arguments --- #
pipeline: Optional[str] = field(
default="independent",
@@ -229,3 +231,16 @@

def is_dataset_provided(self) -> bool:
return self.dataset is not None or self.dataset_path is not None

    def __post_init__(self):
        """Post-initialization hook to issue deprecation warnings."""
        if self.calibrate_moe_context is not None:
            warnings.warn(
                "The 'calibrate_moe_context' parameter is deprecated and "
                "will be removed in a future version. MoE calibration "
                "context is now handled automatically by the pipeline. "
                "This parameter is ignored and will not affect the "
                "calibration process.",
                DeprecationWarning,
                stacklevel=2,
            )
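As a quick sanity check of the deprecation path, a hedged sketch of how the new `__post_init__` behavior could be exercised; it assumes `DatasetArguments` is importable from `llmcompressor.args`, which is an assumption about the package layout rather than something shown in this diff.

import warnings

from llmcompressor.args import DatasetArguments  # assumed export path

# Passing the deprecated flag still constructs the arguments object, but
# __post_init__ emits a DeprecationWarning and the value is otherwise ignored.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    DatasetArguments(calibrate_moe_context=True)

assert any(issubclass(w.category, DeprecationWarning) for w in caught)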