21 changes: 20 additions & 1 deletion examples/awq/README.md
@@ -6,14 +6,33 @@ The AWQ implementation found in LLM Compressor is derived from the pioneering wo

## AWQ Recipe ##

The AWQ recipe has been interfaced as follows, where the `AWQModifier` adjusts model scales ahead of efficient weight quantization by the `QuantizationModifier`.
`AWQModifier` is a smoothing pre-pass (similar to `SmoothQuantModifier`). It adjusts model scales ahead of weight quantization but does not apply quantization itself. It must be stacked with a downstream quantization modifier:

### AWQ + QuantizationModifier (RTN) ###

```python
recipe = [
    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
    QuantizationModifier(scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]),
]
```

> **Contributor review comment (medium):** The `AWQModifier` in this example recipe is missing the `duo_scaling="both"` argument, which is present in the corresponding `llama_example.py` file. For consistency, and to showcase a more complete example, it would be beneficial to include it here.
>
> Suggested change:
>
> ```diff
> -    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
> +    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"),
> ```

See [`llama_example.py`](llama_example.py) for a full runnable example.

### AWQ + GPTQModifier (higher accuracy) ###

```python
recipe = [
    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
    GPTQModifier(scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]),
]
```

> **Contributor review comment (medium):** Similar to the previous example, the `AWQModifier` here is missing the `duo_scaling="both"` argument, which is present in `llama_gptq_example.py`. Adding it would improve consistency between the documentation and the example code.
>
> Suggested change:
>
> ```diff
> -    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
> +    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"),
> ```

See [`llama_gptq_example.py`](llama_gptq_example.py) for a full runnable example.

> **Note**: The `scheme`, `targets`, and `ignore` arguments on `AWQModifier` are used
> internally during scale search and should match those on the downstream quantization modifier.
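
One way to keep those arguments in sync is to build both modifiers from a single set of shared kwargs. A minimal sketch; the `matched_awq_recipe` helper below is hypothetical and not part of llm-compressor:

```python
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier


def matched_awq_recipe(scheme="W4A16_ASYM", targets=("Linear",), ignore=("lm_head",)):
    """Build an AWQ + GPTQ recipe whose shared arguments cannot drift apart."""
    # Hypothetical helper: both modifiers receive identical scheme/targets/ignore.
    shared = dict(scheme=scheme, targets=list(targets), ignore=list(ignore))
    return [
        AWQModifier(**shared, duo_scaling="both"),
        GPTQModifier(**shared),
    ]
```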

## Compressing Your Own Model ##
To use your own model, start with an existing example and change the `model_id` to match your own model stub.
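A minimal sketch of that starting point, assuming a Hugging Face model stub (the `MODEL_ID` below is a placeholder):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder stub: replace with your own model.
MODEL_ID = "your-org/your-model"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
```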
7 changes: 7 additions & 0 deletions examples/awq/llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,10 +50,16 @@ def tokenize(sample):


# Configure the quantization algorithm to run.
# AWQModifier is a smoothing pre-pass: it computes and applies per-channel
# activation scales but does NOT quantize weights itself.
# QuantizationModifier performs the actual weight quantization using those scales.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
    QuantizationModifier(
        scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]
    ),
]

# Apply algorithms.
89 changes: 89 additions & 0 deletions examples/awq/llama_gptq_example.py
@@ -0,0 +1,89 @@
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)


# Configure the quantization algorithm to run.
# AWQModifier is a smoothing pre-pass: it computes and applies per-channel
# activation scales but does NOT quantize weights itself.
# GPTQModifier performs Hessian-based weight quantization on the smoothed model,
# yielding higher accuracy than RTN at the cost of longer calibration time.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
    GPTQModifier(
        scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-gptq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)