Skip to content

Commit 1690a4c

Browse files
authored
GPTQ Actorder Refactor (#2541)
Summary: this code was really spaghetti before, tried to clean it up and make it a little more readable. clean up actorder if tower no longer saving random values we don't need (g_idx that hasn't been permuted) update tests to actually test actorder make tests faster (smaller model + don't tokenize whole dataset) note: smaller model required me to change groupsize to 32 TEST PLAN: pytest -vs -rs /home/HDCharles/repos/llm-compressor/tests/llmcompressor/transformers/gptq/test_gptq_oneshot.py --------- Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
1 parent 01b6405 commit 1690a4c

File tree

2 files changed

+105
-21
lines changed

2 files changed

+105
-21
lines changed

src/llmcompressor/modifiers/gptq/gptq_quantize.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
QuantizationStrategy,
1010
fake_quantize,
1111
)
12-
from compressed_tensors.utils import update_offload_parameter
1312
from loguru import logger
1413

1514
from llmcompressor.modifiers.utils import SPARSITY_THRESHOLD
@@ -110,7 +109,8 @@ def quantize_weight(
110109
num_rows = W.shape[0]
111110
num_columns = W.shape[1]
112111

113-
# generate scale, should include tensor group / use global scale
112+
scale, zero_point = observer(W)
113+
# handle g_idx and activation ordering
114114
if strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
115115
# mapping from column index to group index
116116
g_idx = (
@@ -119,26 +119,16 @@ def quantize_weight(
119119
)
120120

121121
if actorder == ActivationOrdering.GROUP:
122-
# permute by activation order first, then update groups
123122
W, H, perm = _apply_activation_ordering(W, H)
124-
update_offload_parameter(module, "weight_g_idx", g_idx)
123+
# actually need scale/zp for permuted weight for this format
125124
scale, zero_point = observer(W)
126-
127125
# use identity g_idx (invert permutation later)
128126

129127
elif actorder == ActivationOrdering.WEIGHT:
130-
# update groups first, then permute by activation order
131-
scale, zero_point = observer(W)
128+
# permute weights and g_idx
132129
W, H, perm = _apply_activation_ordering(W, H)
133-
134-
# permute g_idx to maintain identity mapping after unpermutation
135130
g_idx = g_idx[perm]
136131

137-
else:
138-
scale, zero_point = observer(W)
139-
else:
140-
scale, zero_point = observer(W)
141-
142132
# sparsity mask
143133
sparsity = tensor_sparsity(W)
144134
preserve_zeros = sparsity >= SPARSITY_THRESHOLD

tests/llmcompressor/transformers/gptq/test_gptq_oneshot.py

Lines changed: 101 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import pytest
22
import torch
3-
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
4-
from transformers import AutoModelForCausalLM
3+
from compressed_tensors.quantization import (
4+
ActivationOrdering,
5+
QuantizationArgs,
6+
QuantizationScheme,
7+
)
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
59

610
from llmcompressor import oneshot
711
from llmcompressor.modifiers.gptq import GPTQModifier
@@ -36,17 +40,60 @@
3640
config_groups={
3741
"group_0": QuantizationScheme(
3842
targets=["re:.*model.layers.2.self_attn.q_proj$"],
39-
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
43+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
4044
)
4145
},
4246
)
4347

4448
recipe_modifier_shorthand_a = GPTQModifier(
45-
ignore=["lm_head"], targets="re:.*model.layers.2.self_attn.q_proj$", scheme="W4A16"
49+
ignore=["lm_head"],
50+
config_groups={
51+
"group_0": QuantizationScheme(
52+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
53+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
54+
)
55+
},
4656
)
4757

4858
recipe_modifier_shorthand_b = GPTQModifier(
49-
ignore=["lm_head"], scheme={"W4A16": ["re:.*model.layers.2.self_attn.q_proj$"]}
59+
ignore=["lm_head"],
60+
config_groups={
61+
"group_0": QuantizationScheme(
62+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
63+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
64+
)
65+
},
66+
)
67+
68+
# Test activation ordering variants
69+
recipe_modifier_group_actorder_weight = GPTQModifier(
70+
ignore=["lm_head"],
71+
config_groups={
72+
"group_0": QuantizationScheme(
73+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
74+
weights=QuantizationArgs(
75+
num_bits=4,
76+
strategy="group",
77+
group_size=32,
78+
actorder=ActivationOrdering.WEIGHT,
79+
),
80+
)
81+
},
82+
)
83+
84+
recipe_modifier_group_actorder_group = GPTQModifier(
85+
ignore=["lm_head"],
86+
config_groups={
87+
"group_0": QuantizationScheme(
88+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
89+
weights=QuantizationArgs(
90+
num_bits=4,
91+
strategy="group",
92+
group_size=32,
93+
actorder=ActivationOrdering.GROUP,
94+
),
95+
)
96+
},
5097
)
5198

5299

@@ -58,20 +105,38 @@
58105
recipe_modifier_full_group,
59106
recipe_modifier_shorthand_a,
60107
recipe_modifier_shorthand_b,
108+
recipe_modifier_group_actorder_weight,
109+
recipe_modifier_group_actorder_group,
61110
],
62111
)
63112
def test_oneshot_application(recipe, tmp_path):
64113
output = tmp_path / "oneshot_output"
65-
model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
114+
model_id = "nm-testing/tinysmokellama-3.2"
66115
dataset = "open_platypus"
67116
device = "cuda:0" if torch.cuda.is_available() else "cpu"
68117

118+
# Load original model for numerical comparison
119+
original_model = AutoModelForCausalLM.from_pretrained(
120+
model_id, torch_dtype=torch.float16, device_map=device
121+
)
122+
tokenizer = AutoTokenizer.from_pretrained(model_id)
123+
124+
# Create test input
125+
test_text = "The quick brown fox jumps over the lazy dog"
126+
inputs = tokenizer(test_text, return_tensors="pt").to(device)
127+
128+
# Get original model output
129+
with torch.no_grad():
130+
original_output = original_model(**inputs).logits
131+
132+
# Quantize model
69133
oneshot(
70-
model=model,
134+
model=model_id,
71135
dataset=dataset,
72136
output_dir=output,
73137
recipe=recipe,
74138
num_calibration_samples=9,
139+
splits={"calibration": "train[:9]"},
75140
)
76141
model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device)
77142

@@ -98,3 +163,32 @@ def test_oneshot_application(recipe, tmp_path):
98163
# Check lm-head is not quantized
99164
not_targetted = model_loaded.lm_head
100165
assert not hasattr(not_targetted, "quantization_scheme")
166+
167+
# Verify g_idx behavior for activation ordering
168+
if weight_args.actorder == ActivationOrdering.GROUP:
169+
# GROUP actorder should save g_idx
170+
assert hasattr(
171+
targetted_linear_layer, "weight_g_idx"
172+
), "GROUP actorder should have g_idx"
173+
elif weight_args.actorder == ActivationOrdering.WEIGHT:
174+
# WEIGHT actorder should NOT save g_idx (identity mapping)
175+
assert not hasattr(
176+
targetted_linear_layer, "weight_g_idx"
177+
), "WEIGHT actorder should not have g_idx"
178+
179+
# Numerical validation: check MSE
180+
with torch.no_grad():
181+
quantized_output = model_loaded(**inputs).logits
182+
183+
mse = torch.nn.functional.mse_loss(quantized_output, original_output).item()
184+
185+
# MSE threshold - quantization should not degrade quality too much
186+
mse_threshold = 0.015
187+
assert mse < mse_threshold, (
188+
f"MSE {mse:.6f} exceeds threshold {mse_threshold}. "
189+
f"Quantization degraded model quality too much."
190+
)
191+
192+
# Cleanup
193+
del original_model, model_loaded
194+
torch.cuda.empty_cache()

0 commit comments

Comments (0)