From 48f9207740c7b74e4fc1019d74d0fa5eec3c6332 Mon Sep 17 00:00:00 2001
From: dqzhengAP
Date: Tue, 10 Mar 2026 12:08:49 -0700
Subject: [PATCH] examples[awq]: update to stacked recipe pattern (AWQModifier
 + QuantizationModifier/GPTQModifier)

AWQModifier is a smoothing pre-pass, not a standalone quantizer. Updated
examples to use the canonical stacked recipe:

    recipe = [AWQModifier(...), QuantizationModifier(...)]

- llama_example.py: explicit stacked AWQ + RTN recipe
- llama_gptq_example.py: new example for AWQ + GPTQ composition
- README.md: documents both stacking patterns

Relates to #2327
---
 examples/awq/README.md             | 106 +++++++++++++++++------------
 examples/awq/llama_example.py      | 102 +++++++++++++++-------------
 examples/awq/llama_gptq_example.py |  92 ++++++++++++++++++++++++++
 3 files changed, 209 insertions(+), 91 deletions(-)
 create mode 100644 examples/awq/llama_gptq_example.py

diff --git a/examples/awq/README.md b/examples/awq/README.md
index 321d77a960..491b3d219f 100644
--- a/examples/awq/README.md
+++ b/examples/awq/README.md
@@ -1,47 +1,59 @@
-# AWQ Quantization #
-
-Activation Aware Quantization (AWQ) is a state-of-the-art technique to quantize the weights of large language models which involves using a small calibration dataset to calibrate the model. The AWQ algorithm utilizes calibration data to derive scaling factors which reduce the dynamic range of weights while minimizing accuracy loss to the most salient weight values.
-
-The AWQ implementation found in LLM Compressor is derived from the pioneering work of [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) and with assistance from its original maintainer, [@casper-hansen](https://github.com/casper-hansen).
-
-## AWQ Recipe ##
-
-The AWQ recipe has been inferfaced as follows, where the `AWQModifier` adjusts model scales ahead of efficient weight quantization by the `QuantizationModifier`
-
-```python
-recipe = [
-    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
-]
-```
-
-## Compressing Your Own Model ##
-To use your own model, start with an existing example change the `model_id` to match your own model stub.
-```python
-model_id = "path/to/your/model"
-model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
-```
-
-## Adding Mappings ##
-In order to target weight and activation scaling locations within the model, the `AWQModifier` must be provided an AWQ mapping. For example, the AWQ mapping for the Llama family of models looks like this:
-
-```python
-[
-    AWQMapping(
-        "re:.*input_layernorm",
-        ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"],
-    ),
-    AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
-    AWQMapping(
-        "re:.*post_attention_layernorm",
-        ["re:.*gate_proj", "re:.*up_proj"],
-    ),
-    AWQMapping(
-        "re:.*up_proj",
-        ["re:.*down_proj"],
-    ),
-]
-```
-
-Note: the mappings define which layers get smoothed whereas targets and ignore define which layers get quantized. So if you include a layer in the ignore list that is going to get matched due to the included mappings, it will get smoothed but not quantized.
-
-To support other model families, you can supply your own mappings via the `mappings` argument with instantiating the `AWQModifier`, or you can add them to the registry [here](/src/llmcompressor/modifiers/awq/mappings.py) (contributions are welcome!)
+# AWQ Quantization #
+
+Activation Aware Quantization (AWQ) is a state-of-the-art technique to quantize the weights of large language models which involves using a small calibration dataset to calibrate the model. The AWQ algorithm utilizes calibration data to derive scaling factors which reduce the dynamic range of weights while minimizing accuracy loss to the most salient weight values.
+
+The AWQ implementation found in LLM Compressor is derived from the pioneering work of [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) and with assistance from its original maintainer, [@casper-hansen](https://github.com/casper-hansen).
+
+## AWQ Recipes ##
+
+`AWQModifier` is a smoothing pre-pass, not a standalone quantizer: it derives and applies per-channel scales, and a downstream modifier performs the actual weight quantization. The canonical stacked recipe pairs it with `QuantizationModifier` (RTN):
+
+```python
+recipe = [
+    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
+    QuantizationModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
+]
+```
+
+For higher accuracy, stack `AWQModifier` with `GPTQModifier` instead, which quantizes the smoothed weights using second-order information:
+
+```python
+recipe = [
+    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
+    GPTQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
+]
+```
+
+Both modifiers in a stack must agree on `scheme`, `targets`, and `ignore`. See `llama_example.py` (AWQ + RTN) and `llama_gptq_example.py` (AWQ + GPTQ) for runnable examples.
+
+## Compressing Your Own Model ##
+To use your own model, start with an existing example and change the `model_id` to match your own model stub.
+```python
+model_id = "path/to/your/model"
+model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
+```
+
+## Adding Mappings ##
+In order to target weight and activation scaling locations within the model, the `AWQModifier` must be provided an AWQ mapping. For example, the AWQ mapping for the Llama family of models looks like this:
+
+```python
+[
+    AWQMapping(
+        "re:.*input_layernorm",
+        ["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"],
+    ),
+    AWQMapping("re:.*v_proj", ["re:.*o_proj"]),
+    AWQMapping(
+        "re:.*post_attention_layernorm",
+        ["re:.*gate_proj", "re:.*up_proj"],
+    ),
+    AWQMapping(
+        "re:.*up_proj",
+        ["re:.*down_proj"],
+    ),
+]
+```
+
+Note: the mappings define which layers get smoothed whereas targets and ignore define which layers get quantized. So if you include a layer in the ignore list that is going to get matched due to the included mappings, it will get smoothed but not quantized.
+
+To support other model families, you can supply your own mappings via the `mappings` argument when instantiating the `AWQModifier`, or you can add them to the registry [here](/src/llmcompressor/modifiers/awq/mappings.py) (contributions are welcome!)
diff --git a/examples/awq/llama_example.py b/examples/awq/llama_example.py
index 8a1ac39dec..dbdd393c7c 100644
--- a/examples/awq/llama_example.py
+++ b/examples/awq/llama_example.py
@@ -1,27 +1,43 @@
-from compressed_tensors.offload import dispatch_model
+"""
+AWQ + QuantizationModifier: Stacked Recipe Example
+==================================================
+AWQ is a pre-quantization smoothing pass. The AWQModifier finds and applies
+optimal per-channel activation scales; the downstream QuantizationModifier
+performs the actual weight quantization using those scales.
+ +This is the canonical stacked recipe pattern: + + recipe = [ + AWQModifier(mappings=..., ignore=["lm_head"]), + QuantizationModifier(scheme="W4A16_ASYM", targets="Linear", ignore=["lm_head"]), + ] + +See README.md for details on providing custom mappings for non-Llama architectures. +""" + from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.modifiers.quantization import QuantizationModifier -# Select model and load it. +# --------------------------------------------------------------------------- +# 1. Model +# --------------------------------------------------------------------------- MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) - -# Select calibration dataset. -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) -# Select number of samples. 256 samples is a good place to start. -# Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 256 -MAX_SEQUENCE_LENGTH = 512 +# --------------------------------------------------------------------------- +# 2. Calibration dataset +# --------------------------------------------------------------------------- +DATASET_ID = "neuralmagic/calibration" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 -# Load dataset and preprocess. 
-ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]") ds = ds.shuffle(seed=42) @@ -30,52 +46,50 @@ def preprocess(example): "text": tokenizer.apply_chat_template( example["messages"], tokenize=False, + add_generation_prompt=False, ) } -ds = ds.map(preprocess) - +ds = ds.map(preprocess, remove_columns=ds.column_names) -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -# Configure the quantization algorithm to run. +# --------------------------------------------------------------------------- +# 3. Recipe: AWQ smoothing pass → RTN weight quantization +# +# AWQModifier : computes and applies per-channel activation scales. +# It needs quant args (via scheme/targets) only to search +# for optimal smoothing scales — it does NOT apply weights. +# QuantizationModifier: applies the final weight quantization using +# the scales produced by AWQModifier. +# --------------------------------------------------------------------------- recipe = [ AWQModifier( - ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both" + ignore=["lm_head"], + # AWQ needs these args internally for scale search; they must match + # the QuantizationModifier below. + scheme="W4A16_ASYM", + targets=["Linear"], + ), + QuantizationModifier( + scheme="W4A16_ASYM", + targets=["Linear"], + ignore=["lm_head"], ), ] -# Apply algorithms. +# --------------------------------------------------------------------------- +# 4. 
Apply +# --------------------------------------------------------------------------- +OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-W4A16" + oneshot( model=model, + tokenizer=tokenizer, dataset=ds, recipe=recipe, + output_dir=OUTPUT_DIR, max_seq_length=MAX_SEQUENCE_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, ) -# Confirm generations of the quantized model look sane. -print("\n\n") -print("========== SAMPLE GENERATION ==============") -dispatch_model(model) -input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to( - model.device -) -output = model.generate(input_ids, max_new_tokens=100) -print(tokenizer.decode(output[0])) -print("==========================================\n\n") - -# Save to disk compressed. -SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym" -model.save_pretrained(SAVE_DIR, save_compressed=True) -tokenizer.save_pretrained(SAVE_DIR) +print(f"\nSaved to: {OUTPUT_DIR}") diff --git a/examples/awq/llama_gptq_example.py b/examples/awq/llama_gptq_example.py new file mode 100644 index 0000000000..42d32ebf22 --- /dev/null +++ b/examples/awq/llama_gptq_example.py @@ -0,0 +1,92 @@ +""" +AWQ + GPTQModifier: Stacked Recipe Example +========================================== +Stacking AWQModifier with GPTQModifier combines AWQ's activation-aware +smoothing with GPTQ's second-order weight quantization for higher accuracy +at W4A16. + + recipe = [ + AWQModifier(...), + GPTQModifier(...), + ] + +AWQModifier runs first and re-scales weights so that quantization-sensitive +channels become easier for GPTQ to handle. +""" + +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.awq import AWQModifier +from llmcompressor.modifiers.quantization import GPTQModifier + +# --------------------------------------------------------------------------- +# 1. 
Model +# --------------------------------------------------------------------------- +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# --------------------------------------------------------------------------- +# 2. Calibration dataset +# --------------------------------------------------------------------------- +DATASET_ID = "neuralmagic/calibration" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + add_generation_prompt=False, + ) + } + + +ds = ds.map(preprocess, remove_columns=ds.column_names) + +# --------------------------------------------------------------------------- +# 3. Recipe: AWQ smoothing pass → GPTQ weight quantization +# +# AWQModifier : activation-aware smoothing (scale search uses scheme args). +# GPTQModifier : Hessian-based weight quantization on the smoothed model. +# +# Both modifiers must agree on scheme / targets / ignore. +# --------------------------------------------------------------------------- +recipe = [ + AWQModifier( + ignore=["lm_head"], + scheme="W4A16_ASYM", + targets=["Linear"], + ), + GPTQModifier( + scheme="W4A16_ASYM", + targets=["Linear"], + ignore=["lm_head"], + dampening_frac=0.01, + ), +] + +# --------------------------------------------------------------------------- +# 4. Apply +# --------------------------------------------------------------------------- +OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-AWQ-GPTQ-W4A16" + +oneshot( + model=model, + tokenizer=tokenizer, + dataset=ds, + recipe=recipe, + output_dir=OUTPUT_DIR, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +print(f"\nSaved to: {OUTPUT_DIR}")