Skip to content
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
bc97d48
update autoround version
yiliu30 Nov 21, 2025
19ab4f2
Merge branch 'main' into autoround-version
yiliu30 Nov 21, 2025
9ba113c
expose bs
yiliu30 Nov 24, 2025
646982a
Merge branch 'autoround-version' of https://github.com/yiliu30/llm-co…
yiliu30 Nov 24, 2025
1050335
use 0.9.1
yiliu30 Nov 26, 2025
50e6682
fix
yiliu30 Nov 27, 2025
d139071
update
yiliu30 Nov 27, 2025
a0affbd
enable auto-dispatch
yiliu30 Dec 2, 2025
17ba9f5
add ds example
yiliu30 Dec 2, 2025
cd943cd
merge main
yiliu30 Dec 15, 2025
8338ed5
pass ignore to ar
yiliu30 Dec 15, 2025
56515af
add qwen example
yiliu30 Dec 15, 2025
ad6c1c0
update example
yiliu30 Dec 15, 2025
09a72c0
format
yiliu30 Dec 15, 2025
af112bd
update
yiliu30 Dec 15, 2025
ec98118
refine suspend hook
yiliu30 Dec 17, 2025
aa50449
Modernize type hints in logarithmic equalization modifier (#2121)
aaarrvind Dec 17, 2025
c5eae60
update
yiliu30 Dec 18, 2025
2d482fc
clean code
yiliu30 Dec 18, 2025
17b7e45
add ut
yiliu30 Dec 18, 2025
7a9b3cd
fix
yiliu30 Dec 18, 2025
4f45b17
fix hint
yiliu30 Dec 18, 2025
0fac601
refine
yiliu30 Dec 18, 2025
0f7a990
speedup ut
yiliu30 Dec 18, 2025
3f25fd1
Modernize transformers module with type hints and generic types (#2034)
sugatmahanti Dec 18, 2025
5f6c8db
fp8 awq examples (#2145)
HDCharles Dec 18, 2025
58ef017
clean
yiliu30 Dec 19, 2025
c9ea99c
add docstring
yiliu30 Dec 19, 2025
d2a7c92
format
yiliu30 Dec 19, 2025
d48c3d6
Merge branch 'main' into auto-device
yiliu30 Dec 19, 2025
2427825
Update lm-eval set-up to address regression (#2142)
dsikka Dec 19, 2025
5a29932
Add HF token to prevent tests from skipping (#2141)
dsikka Dec 19, 2025
6264c59
[awq] simplify compute_layer_means (#2128)
HDCharles Dec 19, 2025
6bb7905
[CPU] Linearize gpt_oss model and add example to quantize it to w4a8 …
isharif168 Dec 19, 2025
993a68e
Merge branch 'main' into auto-device
yiliu30 Dec 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions examples/autoround/deepseek_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
# Default to a public Hugging Face Hub identifier so the example runs anywhere;
# substitute a local checkpoint path if you have the weights downloaded.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The model_id is hardcoded to a local path, which makes this example not portable or runnable for other users. It's better to default to a model identifier from the Hugging Face Hub and provide the local path as a commented-out alternative for users who wish to use a local model.

Suggested change
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"

# model_id = "/storage/yiliu7/deepseek-ai/DeepSeek-V2-Lite-Chat/"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The model_id is set to a hardcoded local path, which makes this example not portable. Please use a single, public model identifier from the Hugging Face Hub and remove the other commented or overwritten assignments for clarity.

Suggested change
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model_id = "/storage/yiliu7/unsloth/DeepSeek-R1-BF16"
# model_id = "/storage/yiliu7/deepseek-ai/DeepSeek-V2-Lite-Chat/"
model_id = "deepseek-ai/DeepSeek-V2-Lite-Chat"

model = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype="auto", trust_remote_code=True
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using trust_remote_code=True can introduce a security vulnerability if the model repository contains malicious code. It is crucial to warn users about this risk, especially in an example script that they might copy and run. Please add a comment explaining that users should only enable this if they trust the source of the model.

)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration configuration.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048

# Fetch a calibration dataset aligned with auto-round's expected format.
ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)


# Quantization recipe: 4-bit weights via AutoRound with a group size of 128,
# leaving the lm_head unquantized.
recipe = AutoRoundModifier(
    targets="Linear", scheme="W4A16", ignore=["lm_head"], iters=32, device_map="0,1"
)


# Run the one-shot compression pipeline.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Sanity-check the quantized model with a short generation.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
encoded = tokenizer("Hello my name is", return_tensors="pt")
encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
generated = model.generate(**encoded, max_new_tokens=100)
print(tokenizer.decode(generated[0]))
print("==========================================\n\n")

# Persist the compressed model and tokenizer.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
71 changes: 71 additions & 0 deletions examples/autoround/qwen3_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from auto_round.calib_dataset import get_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.autoround import AutoRoundModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
# Default to a public Hugging Face Hub identifier so the example runs anywhere;
# local checkpoint paths are kept below as commented-out alternatives.
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
# model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The model_id is set to a hardcoded local path, which makes this example not portable. Please use a single, public model identifier from the Hugging Face Hub and remove the other commented or overwritten assignments for clarity. Using a smaller model might also be more suitable for an example script.

Suggested change
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
model_id = "Qwen/Qwen1.5-0.5B-Chat"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The model_id is hardcoded to a local path, which makes this example not runnable for other users. It's better to default to a model from the Hugging Face Hub and provide the local path as a commented-out alternative.

Suggested change
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"
model_id = "Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen3-30B-A3B"
# model_id = "/storage/yiliu7/Qwen/Qwen2.5-0.5B/"
# model_id = "/storage/yiliu7/Qwen/Qwen3-235B-A22B/"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Select calibration dataset.
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 2048
# Get aligned calibration dataset.

ds = get_dataset(
    tokenizer=tokenizer,
    seqlen=MAX_SEQUENCE_LENGTH,
    nsamples=NUM_CALIBRATION_SAMPLES,
)


# Configure the quantization algorithm to run.
# * quantize the weights to 4 bit with AutoRound with a group size 128
# * keep the lm_head and the MoE router gates ("mlp.gate") in full precision
recipe = AutoRoundModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=[
        "lm_head",
        "re:.*mlp.gate$",
    ],
    iters=1,
    enable_torch_compile=False,
    device_map="0,1",
)


# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    # disable shuffling to get slightly better mmlu score
    shuffle_calibration_samples=False,
)

# Save to disk compressed. Use a relative output directory so the example is
# portable across machines (no hardcoded absolute path prefix).
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The SAVE_DIR is constructed with a hardcoded absolute path prefix, which will cause the script to fail on other machines. It should be a relative path so the model is saved in the current working directory.

SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The SAVE_DIR is constructed using a hardcoded absolute path. This will cause the script to fail for any user who does not have the /storage/yiliu7/ directory. The output directory should be a relative path to make the example portable.

SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"

# Persist the compressed model and tokenizer.
print(f"save to {SAVE_DIR}")
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
batch = tokenizer("Hello my name is", return_tensors="pt")
batch = {k: v.to(model.device) for k, v in batch.items()}
gen_ids = model.generate(**batch, max_new_tokens=100)
print(tokenizer.decode(gen_ids[0]))
print("==========================================\n\n")
57 changes: 52 additions & 5 deletions src/llmcompressor/modifiers/autoround/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple, Union

import torch
from accelerate.hooks import add_hook_to_module, remove_hook_from_submodules
from auto_round import AutoRound
from auto_round.schemes import QuantizationScheme as ARQuantizationScheme
from compressed_tensors.quantization import (
Expand Down Expand Up @@ -54,6 +56,36 @@ def _wrap_decoding_layer(layer: torch.nn.Module) -> _PretrainModelWrapper:
return wrapped_model


import torch.nn as nn


@contextmanager
def suspend_accelerate_hooks(model: nn.Module):
    """
    Temporarily strip Accelerate hooks (e.g. offloading, dtype casting) from
    every submodule of ``model`` and reinstall the originals on exit.
    """
    # Snapshot the hook currently attached to each submodule, if any.
    stashed = {
        submodule: submodule._hf_hook
        for _, submodule in model.named_modules()
        if hasattr(submodule, "_hf_hook")
    }

    # Run the body with all hooks detached.
    remove_hook_from_submodules(model)
    try:
        yield
    finally:
        # Drop anything attached while the context was active, then restore
        # the snapshot taken above.
        remove_hook_from_submodules(model)
        for submodule, hook in stashed.items():
            add_hook_to_module(submodule, hook, append=True)


class AutoRoundModifier(Modifier, QuantizationMixin):
"""
Implements the AutoRound algorithm from https://aclanthology.org/2024.findings-emnlp.662.pdf.
Expand Down Expand Up @@ -110,6 +142,7 @@ class AutoRoundModifier(Modifier, QuantizationMixin):
iters: int = 200
enable_torch_compile: bool = True
batch_size: int = 8
device_map: str = "0"
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The default value for device_map is hardcoded to '0'. This assumes a GPU is available at index 0 and will cause a crash on CPU-only systems or systems with a different GPU configuration. Consider changing the default to 'auto' to let accelerate handle device placement automatically, which is more robust and user-friendly.

Suggested change
device_map: str = "0"
device_map: str = "auto"


# private variables
_all_module_input: Dict[str, List[Tuple]] = PrivateAttr(default_factory=dict)
Expand Down Expand Up @@ -215,15 +248,20 @@ def apply_autoround(self, state, subgraph):
wrapped_model = _wrap_decoding_layer(decoding_layer)
wrapped_model.name_or_path = state.model.name_or_path

with torch.enable_grad(), align_module_device(decoding_layer):
with torch.enable_grad(), align_module_device(
decoding_layer
), suspend_accelerate_hooks(wrapped_model):
ar_quant_scheme = self._mapping_config_to_autoround()
fp_layers = self.get_unquantized_layer_names(decoding_layer)
ar = AutoRound(
model=wrapped_model,
tokenizer="",
scheme=ar_quant_scheme,
iters=self.iters,
enable_torch_compile=self.enable_torch_compile,
batch_size=self.batch_size,
device_map=self.device_map,
fp_layers=",".join(fp_layers) if fp_layers else "",
)
# TODO: configure layer-wise config based on self.resolved_config
ar.configure_layer_config(enable_gguf_official_mixed=False)
Expand All @@ -239,14 +277,12 @@ def apply_autoround(self, state, subgraph):
q_input=self._q_input,
device=str(device),
# Leave offload for LLMC
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comment # Leave offload for LLMC is now misleading since auto_offload is set to True. With the addition of suspend_accelerate_hooks, it seems the intention is now to use auto_round's internal offloading. The comment should be updated to reflect this change in behavior.

Suggested change
# Leave offload for LLMC
# Use auto_round's internal offloading

auto_offload=False,
auto_offload=True,
)
self._q_input = q_input
# Update offload parameters and remove temporary attributes
for _, module in decoding_layer.named_modules():
if hasattr(module, "weight_scale") and hasattr(
module, "weight_zero_point"
):
if hasattr(module, "scale") and hasattr(module, "weight_zero_point"):
# Note: The model's weight is already q-dq in-place by auto-round.
weight_scale = module.scale
del module.scale
Expand Down Expand Up @@ -278,6 +314,17 @@ def on_finalize(self, state: State, **kwargs) -> bool:

return True

def get_unquantized_layer_names(self, wrapped_model) -> List[str]:
    """
    Return names of modules that match a resolved target class but were not
    assigned a ``quantization_scheme`` — i.e. layers to keep in full precision.

    :param wrapped_model: module tree to scan via ``named_modules()``
    :return: list of qualified module names without a quantization scheme
    """
    return [
        name
        for name, module in wrapped_model.named_modules()
        if module.__class__.__name__ in self.resolved_targets
        and getattr(module, "quantization_scheme", None) is None
    ]

def _add_temporary_names(self, model: torch.nn.Module):
for name, mod in model.named_modules():
mod._tmp_name = name
Expand Down
Loading