Merged
Commits (32)

- 9f0e0ac: match_named_modules, add observer on_start instead of on_initialize (brian-dellabetta, Aug 14, 2025)
- 14486af: scoped quant status/config (brian-dellabetta, Aug 20, 2025)
- ff5067a: scoped GPTQModifier (brian-dellabetta, Aug 21, 2025)
- f99db2f: style fixes (brian-dellabetta, Aug 21, 2025)
- 5da7b6d: multi-modifier example (brian-dellabetta, Sep 15, 2025)
- 32ad8dc: revert assert check in GPTQ (brian-dellabetta, Sep 15, 2025)
- faee70f: Merge branch 'main' into bdellabe/scoped-quant-status (brian-dellabetta, Sep 15, 2025)
- 4db397b: stylefix examples (brian-dellabetta, Sep 15, 2025)
- 1c7ae4d: Merge branch 'main' into bdellabe/scoped-quant-status (brian-dellabetta, Sep 16, 2025)
- 64f8f39: KVCacheScaleType import update (brian-dellabetta, Sep 17, 2025)
- a892d2b: Merge branch 'main' into bdellabe/scoped-quant-status (brian-dellabetta, Sep 17, 2025)
- 1d3eceb: codereview multi_modifier -> mixed_precision (brian-dellabetta, Sep 18, 2025)
- 75c7ca6: saved model name (brian-dellabetta, Sep 18, 2025)
- 81cf4a1: GPTQ validation layer (brian-dellabetta, Sep 18, 2025)
- af6a34b: test fixes (brian-dellabetta, Sep 18, 2025)
- 2cc681f: remove TODOs (brian-dellabetta, Sep 18, 2025)
- 855606e: revert GPTQ validation changes, fix failing transformers tests (brian-dellabetta, Sep 18, 2025)
- b25d23b: compresstion test fixes (brian-dellabetta, Sep 18, 2025)
- 1a6eca7: merge main (brian-dellabetta, Sep 19, 2025)
- 50fbf15: move exampe to quantization_non_uniform (brian-dellabetta, Sep 19, 2025)
- a0568f7: QuantizationMixin targets resolution (brian-dellabetta, Sep 19, 2025)
- 5e5e0fe: style fixes (brian-dellabetta, Sep 19, 2025)
- 6cd0350: quant mixin updates (brian-dellabetta, Sep 22, 2025)
- a2377d9: Merge branch 'main' into bdellabe/scoped-quant-status (brian-dellabetta, Sep 22, 2025)
- 1619337: Quant mixin targets validation (brian-dellabetta, Sep 22, 2025)
- e21a933: remove extraneous awq changes (brian-dellabetta, Sep 22, 2025)
- c437c6f: move validation out of resolve quantization config (brian-dellabetta, Sep 22, 2025)
- 4326ee3: remove validation error (brian-dellabetta, Sep 22, 2025)
- 2611ac6: moved quant config call (brian-dellabetta, Sep 22, 2025)
- 2ea5698: retain validation error (brian-dellabetta, Sep 22, 2025)
- 33695d5: don't call resolve config in validation layer (brian-dellabetta, Sep 22, 2025)
- 170f04b: minor refactor for when model.config_groups is None (brian-dellabetta, Sep 22, 2025)
8 changes: 8 additions & 0 deletions examples/multi_modifier/README.md
@@ -0,0 +1,8 @@
# Quantizing Models with Multiple Quantization Modifiers #

This section outlines how multiple quantization modifiers can be applied to the same model, for example applying GPTQ W8A8 to a model's `self_attn` layers and AWQ W4A16 to its `mlp` layers. The heterogeneous application of multiple modifiers comes in two flavors:

1. Run all modifiers in a single sequential pipeline, performing one calibrated run. See `./llama3_example.py` for an example.
2. Run each modifier in its own independent pipeline, performing a calibrated run per modifier. To do this, run `./llama3_example.py` with `oneshot(..., pipeline="independent")` instead of `pipeline="sequential"`.

This is an advanced usage of `llm-compressor` and an active area of research. Best practices will be provided in a future release, after further research and sensitivity analysis.
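
For reference, here is a condensed sketch of the recipe pattern described above. It is illustrative rather than part of this example: it assumes the built-in `open_platypus` calibration dataset alias and that `oneshot` can load the model and tokenizer from the model id, as in the llm-compressor README examples. See `./llama3_example.py` for the fully worked version with explicit calibration-data preprocessing.

```python
# Condensed sketch only; assumes the "open_platypus" dataset alias is available
# and that oneshot loads model/tokenizer from the model id.
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [
    # GPTQ W8A8 on the attention projections
    GPTQModifier(targets=r"re:.*self_attn\.(k|q|o|v)_proj$", scheme="W8A8"),
    # AWQ W4A16 on the MLP projections, with mappings restricted to those layers
    AWQModifier(
        targets=r"re:.*mlp\.(down|gate|up)_proj$",
        mappings=[
            AWQMapping(
                "re:.*post_attention_layernorm$",
                ["re:.*gate_proj$", "re:.*up_proj$"],
            ),
            AWQMapping("re:.*up_proj$", ["re:.*down_proj$"]),
        ],
        scheme="W4A16",
    ),
]

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    dataset="open_platypus",  # assumption: registered calibration dataset alias
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=256,
    pipeline="sequential",  # or "independent" for one calibrated run per modifier
)
```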
101 changes: 101 additions & 0 deletions examples/multi_modifier/llama3_example.py
@@ -0,0 +1,101 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithms to run:
# * quantize self_attn layers to W8A8 with GPTQ
# * quantize mlp layers to W4A16 with AWQ
# Only include AWQ mappings that pertain to the targeted mlp layers.
recipe = [
    GPTQModifier(targets=r"re:.*self_attn\.(k|q|o|v)_proj$", scheme="W8A8"),
    AWQModifier(
        targets=r"re:.*mlp\.(down|gate|up)_proj$",
        mappings=[
            AWQMapping(
                "re:.*post_attention_layernorm$",
                ["re:.*gate_proj$", "re:.*up_proj$"],
            ),
            AWQMapping(
                "re:.*up_proj$",
                ["re:.*down_proj$"],
            ),
        ],
        scheme="W4A16",
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # Option 1) run both modifiers in a single calibrated run
    pipeline="sequential",
    # Option 2) run each modifier in its own separate pipeline
    # pipeline="independent",
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
6 changes: 4 additions & 2 deletions src/llmcompressor/modifiers/awq/base.py
@@ -265,8 +265,10 @@ def on_end(self, state: State, event: Event, **kwargs):

         self.ended_ = True

-        modules = list(state.model.modules())
-        for module in tqdm(modules, desc="Calibrating weights"):
+        for _, module in tqdm(
+            match_named_modules(state.model, self.targets, self.ignore),
+            desc="Calibrating weights",
+        ):
             update_weight_zp_scale(module)

         QuantizationMixin.end_calibration(self, state.model)
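
The change above, like the modifier changes below, routes module selection through `compressed_tensors.utils.match_named_modules` so that each modifier only calibrates the modules it targets. A minimal sketch of that behavior, assuming `match_named_modules(model, targets, ignore)` yields `(name, module)` pairs for modules whose names match the `re:`-prefixed patterns; the toy model below is illustrative, not from this PR:

```python
import torch
from compressed_tensors.utils import match_named_modules


class Block(torch.nn.Module):
    """Toy stand-in for a decoder layer with attention and MLP projections."""

    def __init__(self):
        super().__init__()
        self.self_attn = torch.nn.ModuleDict({"q_proj": torch.nn.Linear(8, 8)})
        self.mlp = torch.nn.ModuleDict(
            {"gate_proj": torch.nn.Linear(8, 8), "up_proj": torch.nn.Linear(8, 8)}
        )


model = torch.nn.Sequential(Block(), Block())

# Only the MLP projections are yielded; the attention projections are left
# untouched, free to be claimed by a different modifier in the same recipe.
for name, module in match_named_modules(model, [r"re:.*mlp\.(gate|up)_proj$"], []):
    print(name)  # expected: "0.mlp.gate_proj", "0.mlp.up_proj", "1.mlp.gate_proj", ...
```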
1 change: 1 addition & 0 deletions src/llmcompressor/modifiers/awq/mappings.py
@@ -157,6 +157,7 @@ class AWQMapping:
     "Phi3ForCausalLM": _phi_mappings,
     "Phi3VForCausalLM": _phi_mappings,
     "Qwen2ForCausalLM": _default_mappings,
+    "Qwen2_5OmniThinkerForConditionalGeneration": _default_mappings,
     "Qwen2MoeForCausalLM": _moe_default_mappings,
     "Qwen3ForCausalLM": _default_mappings,
     "Qwen3MoeForCausalLM": _moe_default_mappings,
8 changes: 6 additions & 2 deletions src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -9,6 +9,7 @@
     align_module_device,
     get_execution_device,
     getattr_chain,
+    match_named_modules,
     update_offload_parameter,
 )
 from loguru import logger
@@ -165,7 +166,10 @@ def on_initialize(self, state: State, **kwargs) -> bool:
         QuantizationMixin.initialize_quantization(self, state.model)

         # prepare module names
-        self._module_names = {m: name for name, m in state.model.named_modules()}
+        self._module_names = {
+            m: name
+            for name, m in match_named_modules(state.model, self.targets, self.ignore)
+        }

         return True

@@ -178,7 +182,7 @@ def on_start(self, state: State, event: Event, **kwargs):

         # register gptq hooks
         added_hook = False
-        for module in state.model.modules():
+        for _, module in match_named_modules(state.model, self.targets, self.ignore):
             if getattr_chain(module, "quantization_scheme.weights", None) is not None:
                 # HACK: previously, embeddings were not quantized because they were not
                 # accessible by the layer compressor. For now, we manually ignore it,
9 changes: 6 additions & 3 deletions src/llmcompressor/modifiers/quantization/quantization/base.py
@@ -1,4 +1,5 @@
 import tqdm
+from compressed_tensors.utils import match_named_modules

 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
@@ -69,14 +70,16 @@ def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
         QuantizationMixin.start_calibration(self, state.model)

-        modules = list(state.model.modules())
+        named_modules = list(
+            match_named_modules(state.model, self.targets, self.ignore)
+        )
         # TODO: this step can be combined with update_weight_zp_scale
         # once update_fused_layer_weight_global_scales is removed
         # and not required by vLLM
-        for module in tqdm.tqdm(modules):
+        for _, module in tqdm.tqdm(named_modules):
             update_weight_global_scale(module)

-        for module in tqdm.tqdm(modules, desc="Calibrating weights"):
+        for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
             update_fused_layer_weight_global_scales(module)
             update_weight_zp_scale(module)

33 changes: 21 additions & 12 deletions src/llmcompressor/modifiers/quantization/quantization/mixin.py
@@ -14,6 +14,7 @@
     is_preset_scheme,
     preset_name_to_scheme,
 )
+from compressed_tensors.utils import match_named_modules
 from pydantic import Field, PrivateAttr, field_validator
 from torch.utils.hooks import RemovableHandle

@@ -116,41 +117,49 @@ def validate_scheme(

     def initialize_quantization(self, model: torch.nn.Module):
         """
-        Attach quantization schemes and observers to modules in the model according to
+        Attach quantization schemes to modules in the model according to
         the quantization config specified on this modifier

         :param model: model to attach schemes and observers to
         """
-        reset_quantization_status(model)  # reset any previously applied qconfigs
-
         # apply scheme and status to model
         config = self.resolve_quantization_config()
+
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            reset_quantization_status(module)  # reset any previously applied qconfigs
+
         apply_quantization_config(model, config)

-        # apply observers, disable quantization until calibration
-        model.apply(self._initialize_observers)
+        # TODO should we disable for entire model or just matching modules?
+        # disable quantization until calibration
         model.apply(disable_quantization)

     def start_calibration(self, model: torch.nn.Module):
         """
-        Register activation calibration hooks (including kv_cache quantization) and
-        enable quantization as we calibrate
+        Attach observers, register activation calibration hooks (including
+        kv_cache quantization) and enable quantization as we calibrate

         :param model: model to prepare for calibration
         """
         self._calibration_hooks = self._initialize_hooks(model)
-        model.apply(apply_calibration_status)
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            self._initialize_observers(module)
Review comment (Collaborator): Why can't we keep this in initialize_quantization?

Reply (brian-dellabetta, Collaborator, Author, Sep 22, 2025): Observers should be initialized on_start so that this aligns with their removal on_end, which is why this was moved into on_start. Without this change, the lifecycle with multiple quantization modifiers would trigger observer hooks before the modifier starts (before it sees any data), which can now happen during a previous modifier's lifecycle.
+            apply_calibration_status(module)
+
+        # TODO should we disable for entire model or just matching modules?
         model.apply(enable_quantization)  # quantize at the same time as calibrate

     def end_calibration(self, model: torch.nn.Module):
         """
-        Remove calibration hooks and set the model status to frozen. Keep quantization
-        enabled for future operations
+        Remove calibration hooks and observers, and set the model status to frozen.
+        Keep quantization enabled for future operations

         :param model: model to end calibration for
         """
         self.remove_hooks(self._calibration_hooks)
-        model.apply(freeze_module_quantization)  # remove observers
+        for _, module in match_named_modules(model, self.targets, self.ignore):
+            freeze_module_quantization(module)  # remove observers

         model.apply(enable_quantization)  # keep quantization enabled

     def has_config(self) -> bool:
@@ -240,7 +249,7 @@ def _initialize_observers(self, module: torch.nn.Module):

     def _initialize_hooks(self, model: torch.nn.Module) -> Set[RemovableHandle]:
         hooks = set()
-        for module in model.modules():
+        for _, module in match_named_modules(model, self.targets, self.ignore):
             if not hasattr(module, "quantization_scheme"):
                 continue

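As the review thread above notes, with more than one quantization modifier in a recipe every modifier is initialized before any of them sees calibration data, so observers attached at initialization time could already be live during an earlier modifier's calibration window. Below is a schematic, illustrative sketch of the corrected ordering; the strings are stand-ins, not the actual llm-compressor lifecycle API.

```python
# Illustrative only: stand-in names, not real llm-compressor lifecycle hooks.
modifiers = ["AWQModifier(mlp, W4A16)", "GPTQModifier(self_attn, W8A8)"]

# 1) every modifier is initialized up front: schemes attached, no observers yet
for m in modifiers:
    print(f"{m}: initialize_quantization -> attach scheme, quantization disabled")

# 2) each modifier gets its own calibration window; observers exist only inside it
for m in modifiers:
    print(f"{m}: start_calibration -> attach observers + calibration hooks")
    print(f"{m}: ... calibration forward passes ...")
    print(f"{m}: end_calibration   -> remove observers + hooks, freeze scales")
```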