21 changes: 20 additions & 1 deletion examples/awq/README.md
@@ -6,14 +6,33 @@ The AWQ implementation found in LLM Compressor is derived from the pioneering wo

## AWQ Recipe ##

The AWQ recipe has been interfaced as follows, where the `AWQModifier` adjusts model scales ahead of efficient weight quantization by the `QuantizationModifier`.
`AWQModifier` is a smoothing pre-pass (similar to `SmoothQuantModifier`). It adjusts model scales ahead of weight quantization but does not apply quantization itself. It must be stacked with a downstream quantization modifier:

### AWQ + QuantizationModifier (RTN) ###

```python
recipe = [
    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
    QuantizationModifier(scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]),
]
```

> **Contributor review comment (medium):** The `AWQModifier` in this example recipe is missing the `duo_scaling="both"` argument, which is present in the corresponding `llama_example.py` file. For consistency, and to showcase a more complete example, it would be beneficial to include it here.
>
> Suggested change:
>
> ```diff
> -    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
> +    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"),
> ```

See [`llama_example.py`](llama_example.py) for a full runnable example.

### AWQ + GPTQModifier (higher accuracy) ###

```python
recipe = [
    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
    GPTQModifier(scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]),
]
```

> **Contributor review comment (medium):** Similar to the previous example, the `AWQModifier` here is missing the `duo_scaling="both"` argument, which is present in `llama_gptq_example.py`. Adding it would improve consistency between the documentation and the example code.
>
> Suggested change:
>
> ```diff
> -    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"]),
> +    AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"),
> ```

See [`llama_gptq_example.py`](llama_gptq_example.py) for a full runnable example.

> **Note**: The `scheme`, `targets`, and `ignore` arguments on `AWQModifier` are used
> internally during scale search and should match those on the downstream quantization modifier.
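
One way to keep those arguments in sync is to build both modifiers from a single set of shared kwargs. A minimal sketch; the `matched_awq_recipe` helper below is hypothetical and not part of llm-compressor:

```python
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier


def matched_awq_recipe(scheme="W4A16_ASYM", targets=("Linear",), ignore=("lm_head",)):
    """Build an AWQ + GPTQ recipe whose shared arguments cannot drift apart."""
    # Hypothetical helper: both modifiers receive identical scheme/targets/ignore.
    shared = dict(scheme=scheme, targets=list(targets), ignore=list(ignore))
    return [
        AWQModifier(**shared, duo_scaling="both"),
        GPTQModifier(**shared),
    ]
```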

## Compressing Your Own Model ##
To use your own model, start with an existing example and change the `model_id` to match your own model stub.
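A minimal sketch of that starting point, assuming a Hugging Face model stub (the `MODEL_ID` below is a placeholder):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder stub: replace with your own model.
MODEL_ID = "your-org/your-model"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
```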
7 changes: 7 additions & 0 deletions examples/awq/llama_example.py
@@ -4,6 +4,7 @@

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,10 +50,16 @@ def tokenize(sample):


# Configure the quantization algorithm to run.
# AWQModifier is a smoothing pre-pass: it computes and applies per-channel
# activation scales but does NOT quantize weights itself.
# QuantizationModifier performs the actual weight quantization using those scales.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
    QuantizationModifier(
        scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]
    ),
]

# Apply algorithms.
89 changes: 89 additions & 0 deletions examples/awq/llama_gptq_example.py
@@ -0,0 +1,89 @@
from compressed_tensors.offload import dispatch_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)


# Configure the quantization algorithm to run.
# AWQModifier is a smoothing pre-pass: it computes and applies per-channel
# activation scales but does NOT quantize weights itself.
# GPTQModifier performs Hessian-based weight quantization on the smoothed model,
# yielding higher accuracy than RTN at the cost of longer calibration time.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
    GPTQModifier(
        scheme="W4A16_ASYM", targets=["Linear"], ignore=["lm_head"]
    ),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-gptq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)