Skip to content

Commit 28ecfec

Browse files
committed
Merge branch 'kylesayrs/revert-kv-cache-tests' into kylesayrs/quantization-mixin
2 parents 5b3e5eb + b3aa8d4 commit 28ecfec

File tree

38 files changed

+1259
-186
lines changed

38 files changed

+1259
-186
lines changed

.github/workflows/set-comment.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
name: PR Reminder Comment Bot
22
on:
3-
pull_request:
4-
branches:
5-
- main
3+
pull_request_target:
4+
branches: [main]
65
types: [opened]
76

87
jobs:

.github/workflows/test-check-transformers.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,17 +51,24 @@ jobs:
5151
with:
5252
python-version: '3.9'
5353
- uses: actions/checkout@v4
54+
with:
55+
fetch-depth: 0
56+
fetch-tags: true
5457
- name: "⚙️ Install dependencies"
5558
run: pip3 install -U pip setuptools && pip3 install .[dev]
5659
- uses: actions/checkout@v4
5760
with:
5861
repository: "neuralmagic/compressed-tensors"
5962
path: "compressed-tensors"
63+
fetch-depth: 0
64+
fetch-tags: true
6065
- name: "⚙️ Install compressed-tensors dependencies"
6166
id: install
6267
run: |
63-
pip3 uninstall -y compressed-tensors compressed-tensors-nightly
64-
pip3 install ./compressed-tensors/
68+
pip3 uninstall -y compressed-tensors
69+
export GIT_CEILING_DIRECTORIES="$(pwd)"
70+
cd compressed-tensors
71+
BUILD_TYPE=nightly pip3 install .
6572
- name: "Clean compressed-tensors directory"
6673
run: rm -r compressed-tensors/
6774
- name: "🔬 Running transformers tests"

.github/workflows/test-check.yaml

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on:
44

55
env:
66
CADENCE: "commit"
7-
7+
88
jobs:
99

1010
base-tests:
@@ -14,16 +14,23 @@ jobs:
1414
with:
1515
python-version: '3.12'
1616
- uses: actions/checkout@v4
17+
with:
18+
fetch-depth: 0
19+
fetch-tags: true
1720
- name: "⚙️ Install dependencies"
1821
run: pip3 install -U pip setuptools && pip3 install .[dev]
1922
- uses: actions/checkout@v4
2023
with:
2124
repository: "neuralmagic/compressed-tensors"
2225
path: "compressed-tensors"
26+
fetch-depth: 0
27+
fetch-tags: true
2328
- name: "⚙️ Install compressed-tensors dependencies"
2429
run: |
25-
pip3 uninstall -y compressed-tensors compressed-tensors-nightly
26-
pip3 install ./compressed-tensors/
30+
pip3 uninstall -y compressed-tensors
31+
export GIT_CEILING_DIRECTORIES="$(pwd)"
32+
cd compressed-tensors
33+
BUILD_TYPE=nightly pip3 install .
2734
- name: "Clean compressed-tensors directory"
2835
run: rm -r compressed-tensors/
2936
- name: "🔬 Running base tests"
@@ -36,16 +43,23 @@ jobs:
3643
with:
3744
python-version: '3.11'
3845
- uses: actions/checkout@v4
46+
with:
47+
fetch-depth: 0
48+
fetch-tags: true
3949
- name: "⚙️ Install dependencies"
4050
run: pip3 install -U pip setuptools && pip3 install .[dev]
4151
- uses: actions/checkout@v4
4252
with:
4353
repository: "neuralmagic/compressed-tensors"
4454
path: "compressed-tensors"
55+
fetch-depth: 0
56+
fetch-tags: true
4557
- name: "⚙️ Install compressed-tensors dependencies"
4658
run: |
47-
pip3 uninstall -y compressed-tensors compressed-tensors-nightly
48-
pip3 install ./compressed-tensors/
59+
pip3 uninstall -y compressed-tensors
60+
export GIT_CEILING_DIRECTORIES="$(pwd)"
61+
cd compressed-tensors
62+
BUILD_TYPE=nightly pip3 install .
4963
- name: "Clean compressed-tensors directory"
5064
run: rm -r compressed-tensors/
5165
- name: "🔬 Running pytorch tests"
@@ -59,16 +73,23 @@ jobs:
5973
with:
6074
python-version: '3.10'
6175
- uses: actions/checkout@v4
76+
with:
77+
fetch-depth: 0
78+
fetch-tags: true
6279
- name: "⚙️ Install dependencies"
6380
run: pip3 install -U pip setuptools && pip3 install .[dev]
6481
- uses: actions/checkout@v4
6582
with:
6683
repository: "neuralmagic/compressed-tensors"
6784
path: "compressed-tensors"
85+
fetch-depth: 0
86+
fetch-tags: true
6887
- name: "⚙️ Install compressed-tensors dependencies"
6988
run: |
70-
pip3 uninstall -y compressed-tensors compressed-tensors-nightly
71-
pip3 install ./compressed-tensors/
89+
pip3 uninstall -y compressed-tensors
90+
export GIT_CEILING_DIRECTORIES="$(pwd)"
91+
cd compressed-tensors
92+
BUILD_TYPE=nightly pip3 install .
7293
- name: "Clean compressed-tensors directory"
7394
run: rm -r compressed-tensors/
7495
- name: "🔬 Running pytorch tests"

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
### Supported Algorithms
2121
* Simple PTQ
2222
* GPTQ
23+
* AWQ
2324
* SmoothQuant
2425
* SparseGPT
2526

@@ -41,7 +42,8 @@ pip install llmcompressor
4142
Applying quantization with `llmcompressor`:
4243
* [Activation quantization to `int8`](examples/quantization_w8a8_int8/README.md)
4344
* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8/README.md)
44-
* [Weight only quantization to `int4`](examples/quantization_w4a16/README.md)
45+
* [Weight only quantization to `int4` using GPTQ](examples/quantization_w4a16/README.md)
46+
* [Weight only quantization to `int4` using AWQ](examples/awq/awq_one_shot.py)
4547
* [Quantizing MoE LLMs](examples/quantizing_moe/README.md)
4648
* [Quantizing Vision-Language Models](examples/multimodal_vision/README.md)
4749
* [Quantizing Audio-Language Models](examples/multimodal_audio/README.md)

docs/TODO.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

docs/images/architecture.png

-118 KB
Binary file not shown.

docs/schemes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ PTQ is performed to reduce the precision of quantizable weights (e.g., linear la
55

66
### [W4A16](../examples/quantization_w4a16/README.md)
77
- Uses GPTQ to compress weights to 4 bits. Requires calibration dataset.
8+
- Optionally, [AWQ can also be leveraged for W4A16 quantization](../examples/awq/awq_one_shot.py)
89
- Useful speed ups in low QPS regimes with more weight compression.
910
- Recommended for any GPU type.
1011

examples/awq/awq_one_shot.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import lm_eval
2+
from compressed_tensors.quantization import (
3+
QuantizationArgs,
4+
QuantizationScheme,
5+
QuantizationStrategy,
6+
QuantizationType,
7+
)
8+
from lm_eval.utils import make_table
9+
from transformers import AutoModelForCausalLM, AutoTokenizer
10+
11+
from llmcompressor import oneshot
12+
from llmcompressor.modifiers.awq import AWQModifier
13+
from llmcompressor.modifiers.quantization import QuantizationModifier
14+
15+
# This example demonstrates how to:
16+
# 1) Run the `llm-compressor` implementation of AWQ
17+
# 2) Evaluate the compressed model with the lm_eval framework
18+
19+
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
20+
DATASET_ID = "mit-han-lab/pile-val-backup"
21+
DATASET_SPLIT = "validation"
22+
NUM_CALIBRATION_SAMPLES = 256
23+
MAX_SEQUENCE_LENGTH = 512
24+
OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
25+
26+
#
27+
# 1) Run LLM Compressor AWQ implementation
28+
#
29+
30+
recipe = [
31+
AWQModifier(bits=4, symmetric=False),
32+
QuantizationModifier(
33+
ignore=["lm_head"],
34+
config_groups={
35+
"group_0": QuantizationScheme(
36+
targets=["Linear"],
37+
weights=QuantizationArgs(
38+
num_bits=4,
39+
type=QuantizationType.INT,
40+
dynamic=False,
41+
symmetric=False,
42+
strategy=QuantizationStrategy.GROUP,
43+
group_size=128,
44+
),
45+
)
46+
},
47+
),
48+
]
49+
50+
model = AutoModelForCausalLM.from_pretrained(
51+
MODEL_ID, device_map="auto", torch_dtype="auto"
52+
)
53+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
54+
55+
56+
def get_calib_dataset(tokenizer):
57+
from datasets import load_dataset
58+
59+
ds = load_dataset(
60+
DATASET_ID,
61+
split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*100}]",
62+
)
63+
64+
def preprocess(example):
65+
return {
66+
"input_ids": tokenizer.encode(example["text"].strip())[:MAX_SEQUENCE_LENGTH]
67+
}
68+
69+
ds = (
70+
ds.shuffle(seed=42)
71+
.map(preprocess, remove_columns=ds.column_names)
72+
.filter(lambda example: len(example["input_ids"]) >= MAX_SEQUENCE_LENGTH)
73+
.select(range(NUM_CALIBRATION_SAMPLES))
74+
)
75+
76+
return ds
77+
78+
79+
oneshot(
80+
model=model,
81+
dataset=get_calib_dataset(tokenizer=tokenizer),
82+
recipe=recipe,
83+
output_dir=OUTPUT_DIR,
84+
max_seq_length=MAX_SEQUENCE_LENGTH,
85+
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
86+
)
87+
88+
print("Done! model saved to", OUTPUT_DIR)
89+
90+
#
91+
# 2) Evaluate model on wikitext perplexity
92+
#
93+
94+
results = lm_eval.simple_evaluate(
95+
model="vllm",
96+
model_args={
97+
"pretrained": OUTPUT_DIR,
98+
"add_bos_token": True,
99+
"dtype": "bfloat16",
100+
"gpu_memory_utilization": 0.5,
101+
},
102+
tasks=["wikitext"],
103+
num_fewshot=5,
104+
batch_size="auto",
105+
)
106+
print(make_table(results))

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def localversion_func(version: ScmVersion) -> str:
102102
"sparsity, optimization, model optimization, model compression, "
103103
),
104104
license="Apache",
105-
url="https://github.com/neuralmagic/llm-compressor",
105+
url="https://github.com/vllm-project/llm-compressor",
106106
include_package_data=True,
107107
package_dir={"": "src"},
108108
packages=find_packages(
@@ -115,7 +115,7 @@ def localversion_func(version: ScmVersion) -> str:
115115
"requests>=2.0.0",
116116
"tqdm>=4.0.0",
117117
"torch>=1.7.0",
118-
"transformers>4.0,<4.50",
118+
"transformers>4.0,<5.0",
119119
"datasets",
120120
"accelerate>=0.20.3,!=1.1.0",
121121
"pynvml",
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# flake8: noqa
2+
3+
from .base import *
4+
from .mappings import *

0 commit comments

Comments
 (0)