Skip to content
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
bc97d48
update autoround version
yiliu30 Nov 21, 2025
19ab4f2
Merge branch 'main' into autoround-version
yiliu30 Nov 21, 2025
9ba113c
expose bs
yiliu30 Nov 24, 2025
646982a
Merge branch 'autoround-version' of https://github.com/yiliu30/llm-co…
yiliu30 Nov 24, 2025
1050335
use 0.9.1
yiliu30 Nov 26, 2025
50e6682
fix
yiliu30 Nov 27, 2025
d139071
update
yiliu30 Nov 27, 2025
a0affbd
enable auto-dispatch
yiliu30 Dec 2, 2025
17ba9f5
add ds example
yiliu30 Dec 2, 2025
cd943cd
merge main
yiliu30 Dec 15, 2025
8338ed5
pass ignore to ar
yiliu30 Dec 15, 2025
56515af
add qwen example
yiliu30 Dec 15, 2025
ad6c1c0
update example
yiliu30 Dec 15, 2025
09a72c0
format
yiliu30 Dec 15, 2025
af112bd
update
yiliu30 Dec 15, 2025
ec98118
refine suspend hook
yiliu30 Dec 17, 2025
aa50449
Modernize type hints in logarithmic equalization modifier (#2121)
aaarrvind Dec 17, 2025
c5eae60
update
yiliu30 Dec 18, 2025
2d482fc
clean code
yiliu30 Dec 18, 2025
17b7e45
add ut
yiliu30 Dec 18, 2025
7a9b3cd
fix
yiliu30 Dec 18, 2025
4f45b17
fix hint
yiliu30 Dec 18, 2025
0fac601
refine
yiliu30 Dec 18, 2025
0f7a990
speedup ut
yiliu30 Dec 18, 2025
3f25fd1
Modernize transformers module with type hints and generic types (#2034)
sugatmahanti Dec 18, 2025
5f6c8db
fp8 awq examples (#2145)
HDCharles Dec 18, 2025
58ef017
clean
yiliu30 Dec 19, 2025
c9ea99c
add docstring
yiliu30 Dec 19, 2025
d2a7c92
format
yiliu30 Dec 19, 2025
d48c3d6
Merge branch 'main' into auto-device
yiliu30 Dec 19, 2025
2427825
Update lm-eval set-up to address regression (#2142)
dsikka Dec 19, 2025
5a29932
Add HF token to prevent tests from skipping (#2141)
dsikka Dec 19, 2025
6264c59
[awq] simplify compute_layer_means (#2128)
HDCharles Dec 19, 2025
6bb7905
[CPU] Linearize gpt_oss model and add example to quantize it to w4a8 …
isharif168 Dec 19, 2025
993a68e
Merge branch 'main' into auto-device
yiliu30 Dec 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions examples/autoround/deepseek_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
# Default to a public Hugging Face Hub identifier so the example runs anywhere;
# substitute a local checkpoint path if you have the weights downloaded.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The model_id is hardcoded to a local path, which makes this example not portable or runnable for other users. It's better to default to a model identifier from the Hugging Face Hub and provide the local path as a commented-out alternative for users who wish to use a local model.

Suggested change
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"

# model_id = "/storage/yiliu7/deepseek-ai/DeepSeek-V2-Lite-Chat/"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The model_id is set to a hardcoded local path, which makes this example not portable. Please use a single, public model identifier from the Hugging Face Hub and remove the other commented or overwritten assignments for clarity.

Suggested change
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
# model_id = "/storage/yiliu7/deepseek-ai/DeepSeek-V2-Lite-Chat/"
model_id = "deepseek-ai/DeepSeek-V2-Lite-Chat"

model = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype="auto", trust_remote_code=True
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using trust_remote_code=True can introduce a security vulnerability if the model repository contains malicious code. It is crucial to warn users about this risk, especially in an example script that they might copy and run. Please add a comment explaining that users should only enable this if they trust the source of the model.

)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration configuration.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048

# Fetch a calibration dataset aligned with auto-round's expected format.
ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)


# Quantization recipe: 4-bit weights via AutoRound with a group size of 128,
# leaving the lm_head unquantized.
recipe = AutoRoundModifier(
    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=32, device_map="0,1"
)


# Run the one-shot compression pipeline.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Sanity-check the quantized model with a short generation.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
encoded = tokenizer("Hello my name is", return_tensors="pt")
encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
generated = model.generate(**encoded, max_new_tokens=100)
print(tokenizer.decode(generated[0]))
print("==========================================\n\n")

# Persist the compressed model and tokenizer.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
71 changes: 71 additions & 0 deletions examples/autoround/qwen3_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
# Default to a public Hugging Face Hub identifier so the example runs anywhere;
# local checkpoint paths are kept below as commented-out alternatives.
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
# model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The model_id is set to a hardcoded local path, which makes this example not portable. Please use a single, public model identifier from the Hugging Face Hub and remove the other commented or overwritten assignments for clarity. Using a smaller model might also be more suitable for an example script.

Suggested change
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
model_id = "Qwen/Qwen1.5-0.5B-Chat"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The model_id is hardcoded to a local path, which makes this example not runnable for other users. It's better to default to a model from the Hugging Face Hub and provide the local path as a commented-out alternative.

Suggested change
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
# model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048
# Get aligned calibration dataset.

ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)


# Configure the quantization algorithm to run.
# * quantize the weights to 4 bit with AutoRound with a group size 128
# * keep the lm_head and the MoE router gates ("mlp.gate") in full precision
recipe = AutoRoundModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate$",
    ],
    iters=1,
    enable_torch_compile=False,
    device_map="0,1",
)


# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Save to disk compressed. Use a relative output directory so the example is
# portable across machines (no hardcoded absolute path prefix).
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The SAVE_DIR is constructed with a hardcoded absolute path prefix, which will cause the script to fail on other machines. It should be a relative path so the model is saved in the current working directory.

SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The SAVE_DIR is constructed using a hardcoded absolute path. This will cause the script to fail for any user who does not have the /storage/yiliu7/ directory. The output directory should be a relative path to make the example portable.

SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"

# Persist the compressed model and tokenizer.
print(f"save to {SAVE_DIR}")
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
batch = tokenizer("Hello my name is", return_tensors="pt")
batch = {k: v.to(model.device) for k, v in batch.items()}
gen_ids = model.generate(**batch, max_new_tokens=100)
print(tokenizer.decode(gen_ids[0]))
print("==========================================\n\n")
57 changes: 52 additions & 5 deletions src/llmcompressor/modifiers/autoround/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple, Union

import torch
from accelerate.hooks import add_hook_to_module, remove_hook_from_submodules
from auto_round import AutoRound
from auto_round.schemes import QuantizationScheme as ARQuantizationScheme
from compressed_tensors.quantization import (
Expand Down Expand Up @@ -54,6 +56,36 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper:
return wrapped_model


import torch.nn as nn


@contextmanager
def suspend_accelerate_hooks(model: nn.Module):
    """
    Temporarily strip Accelerate hooks (e.g. offloading, dtype casting) from
    every submodule of ``model`` and reinstall the originals on exit.
    """
    # Snapshot the hook currently attached to each submodule, if any.
    stashed = {
        submodule: submodule._hf_hook
        for _, submodule in model.named_modules()
        if hasattr(submodule, "_hf_hook")
    }

    # Run the body with all hooks detached.
    remove_hook_from_submodules(model)
    try:
        yield
    finally:
        # Drop anything attached while the context was active, then restore
        # the snapshot taken above.
        remove_hook_from_submodules(model)
        for submodule, hook in stashed.items():
            add_hook_to_module(submodule, hook, append=True)


class AutoRoundModifier(Modifier, QuantizationMixin):
"""
Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
Expand Down Expand Up @@ -110,6 +142,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
iters: int = 200
enable_torch_compile: bool = True
batch_size: int = 8
device_map: str = "0"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The default value for device_map is hardcoded to '0'. This assumes a GPU is available at index 0 and will cause a crash on CPU-only systems or systems with a different GPU configuration. Consider changing the default to 'auto' to let accelerate handle device placement automatically, which is more robust and user-friendly.

Suggested change
device_map: str = "0"
device_map: str = "auto"


# private variables
_all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict)
Expand Down Expand Up @@ -215,15 +248,20 @@ def apply_autoround(self, state, subgraph):
wrapped_model = _wrap_decoding_layer(decoding_layer)
wrapped_model.name_or_path = state.model.name_or_path

with torch.enable_grad(), align_module_device(decoding_layer):
with torch.enable_grad(), align_module_device(
decoding_layer
), suspend_accelerate_hooks(wrapped_model):
ar_quant_scheme = self._mapping_config_to_autoround()
fp_layers = self.get_unquantized_layer_names(decoding_layer)
ar = AutoRound(
model=wrapped_model,
tokenizer="",
scheme=ar_quant_scheme,
iters=self.iters,
enable_torch_compile=self.enable_torch_compile,
batch_size=self.batch_size,
device_map=self.device_map,
fp_layers=",".join(fp_layers) if fp_layers else "",
)
# TODO: configure layer-wise config based on self.resolved_config
ar.configure_layer_config(enable_gguf_official_mixed=False)
Expand All @@ -239,14 +277,12 @@ def apply_autoround(self, state, subgraph):
q_input=self._q_input,
device=str(device),
# Leave offload for LLMC
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comment # Leave offload for LLMC is now misleading since auto_offload is set to True. With the addition of suspend_accelerate_hooks, it seems the intention is now to use auto_round's internal offloading. The comment should be updated to reflect this change in behavior.

Suggested change
# Leave offload for LLMC
# Use auto_round's internal offloading

auto_offload=False,
auto_offload=True,
)
self._q_input = q_input
# Update offload parameters and remove temporary attributes
for _, module in decoding_layer.named_modules():
if hasattr(module, "weight_scale") and hasattr(
module, "weight_zero_point"
):
if hasattr(module, "scale") and hasattr(module, "weight_zero_point"):
# Note: The model's weight is already q-dq in-place by auto-round.
weight_scale = module.scale
del module.scale
Expand Down Expand Up @@ -278,6 +314,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

return True

def get_unquantized_layer_names(self, wrapped_model) -> List[str]:
    """
    Return names of modules that match a resolved target class but were not
    assigned a ``quantization_scheme`` — i.e. layers to keep in full precision.

    :param wrapped_model: module tree to scan via ``named_modules()``
    :return: list of qualified module names without a quantization scheme
    """
    return [
        name
        for name, module in wrapped_model.named_modules()
        if module.__class__.__name__ in self.resolved_targets
        and getattr(module, "quantization_scheme", None) is None
    ]

def _add_temporary_names(self, model: torch.nn.Module):
for name, mod in model.named_modules():
mod._tmp_name = name
Expand Down
Loading