Skip to content

Commit fb9e3ce

Browse files
authored
Merge branch 'main' into qwen3VLMoE_lineared
2 parents 15da5c6 + b06bf56 commit fb9e3ce

File tree

11 files changed: +47 additions, -35 deletions

examples/transform/quip_example.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
# * apply quip transforms to model in order to make quantization easier
2222
# * quantize the weights to 4 bit with a group size 128
2323
recipe = [
24-
QuIPModifier(rotations=["v", "u"], transform_type="random-hadamard"),
24+
QuIPModifier(
25+
rotations=["v", "u"], transform_block_size=128, transform_type="random-hadamard"
26+
),
2527
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
2628
]
2729

examples/transform/spinquant_example.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
1212
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
1313

14-
# NOTE: currently only fused rotations (R1 & R2) are available
15-
# Learned rotations and online rotations (R3 & R4) will be added
16-
# in a future release.
14+
# NOTE: currently only rotations R1, R2, and R4 are available
15+
# R3 and learned R1/R2 rotations will be added in a future release.
1716
# Configure the quantization algorithm to run.
1817
# * apply spinquant transforms to model to reduce quantization loss
1918
# * quantize the weights to 4 bit with group size 128
2019
recipe = [
21-
SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
20+
SpinQuantModifier(
21+
rotations=["R1", "R2", "R4"], transform_block_size=64, transform_type="hadamard"
22+
),
2223
QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
2324
]
2425

@@ -37,6 +38,6 @@
3738
print("==========================================\n\n")
3839

3940
# Save to disk compressed.
40-
SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2-w4a16"
41+
SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2R4-w4a16"
4142
model.save_pretrained(SAVE_DIR, save_compressed=True)
4243
tokenizer.save_pretrained(SAVE_DIR)

src/llmcompressor/entrypoints/train.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import math
1111
import os
1212

13+
from compressed_tensors.utils import deprecated
1314
from loguru import logger
1415
from transformers import PreTrainedModel
1516

@@ -22,6 +23,13 @@
2223
from .utils import post_process, pre_process
2324

2425

26+
@deprecated(
27+
message=(
28+
"Training support will be removed in future releases. Please use "
29+
"the llmcompressor Axolotl integration for fine-tuning "
30+
"https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open" # noqa: E501
31+
)
32+
)
2533
def train(**kwargs) -> PreTrainedModel:
2634
"""
2735
Fine-tuning entrypoint that supports vanilla fine-tuning and

src/llmcompressor/modifiers/awq/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,10 @@ class AWQModifier(Modifier, QuantizationMixin):
139139
default_factory=dict
140140
)
141141

142+
# NOTE: different name chosen to avoid collision with
143+
# QuantizationMixin.validate_model_after, which must be called first
142144
@model_validator(mode="after")
143-
def validate_model_after(model: "AWQModifier") -> "AWQModifier":
145+
def validate_awq_after(model: "AWQModifier") -> "AWQModifier":
144146
"""
145147
Confirm only one configuration for group_size, symmetric, and num_bits,
146148
as AWQ algorithm depends on it

src/llmcompressor/modifiers/transform/quip/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def _create_config(self) -> TransformConfig:
135135
def _create_v_scheme(self) -> TransformScheme:
136136
return TransformScheme(
137137
type=self.transform_type,
138-
block_size=self.transform_block_size,
138+
head_dim=self.transform_block_size,
139139
apply=[
140140
TransformArgs(
141141
targets=self.targets,
@@ -157,7 +157,7 @@ def _create_v_scheme(self) -> TransformScheme:
157157
def _create_u_scheme(self) -> TransformScheme:
158158
return TransformScheme(
159159
type=self.transform_type,
160-
block_size=self.transform_block_size,
160+
head_dim=self.transform_block_size,
161161
apply=[
162162
TransformArgs(
163163
targets=self.targets,

src/llmcompressor/modifiers/transform/spinquant/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def _create_r1_scheme(self) -> TransformScheme:
193193
randomize=self.randomize,
194194
requires_grad=self.learnable,
195195
precision=self.precision,
196-
block_size=self.transform_block_size,
196+
head_dim=self.transform_block_size,
197197
apply=[
198198
TransformArgs(
199199
targets=[
@@ -240,7 +240,7 @@ def _create_r2_scheme(self, model: PreTrainedModel) -> TransformScheme:
240240
randomize=self.randomize,
241241
requires_grad=self.learnable,
242242
precision=self.precision,
243-
block_size=head_dim,
243+
head_dim=head_dim,
244244
apply=[
245245
TransformArgs(targets=[self.mappings.attn_v], location="weight_output"),
246246
TransformArgs(
@@ -262,7 +262,7 @@ def _create_r4_scheme(self) -> TransformScheme:
262262
randomize=self.randomize,
263263
requires_grad=self.learnable,
264264
precision=self.precision,
265-
block_size=self.transform_block_size,
265+
head_dim=self.transform_block_size,
266266
apply=[
267267
TransformArgs(
268268
targets=[*self.mappings.mlp_out],
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
cadence: "nightly"
22
test_type: "regression"
33
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4-
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml
5-
dataset_id: "mit-han-lab/pile-val-backup"
6-
dataset_split: validation
7-
num_calibration_samples: 2000
4+
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
5+
dataset_id: HuggingFaceH4/ultrachat_200k
6+
dataset_split: train_sft
87
scheme: W4A16_weight_asym_awq
98
save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
cadence: "nightly"
2+
test_type: "regression"
3+
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4+
recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml
5+
dataset_id: HuggingFaceH4/ultrachat_200k
6+
dataset_split: train_sft
7+
scheme: W4A16_weight_sym_awq
8+
save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq

tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml renamed to tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml

File renamed without changes.

tests/llmcompressor/transformers/compression/test_quantization.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,15 +145,19 @@ def test_perplexity(setup_model_and_config):
145145
dispatch_for_generation(model)
146146

147147
total_ppl = 0.0
148-
total_non_nan = 0
149-
for idx, sample in enumerate(dataloader):
150-
if idx >= config["num_eval"]:
148+
total_samples = 0
149+
for sample in dataloader:
150+
if total_samples >= config["num_eval"]:
151151
break
152-
output = model(**tensors_to_device(sample, "cuda:0"))
153-
if torch.isnan(output.loss):
152+
# -100 in labels indicates that the token is not part of the loss calculation
153+
pct_labels_in_sample = (sample["labels"] != -100).to(torch.float).mean().item()
154+
if pct_labels_in_sample <= 0.25:
155+
# More than 25% of the tokens in the sample must be part of the loss calculation,
156+
# otherwise the perplexity is too volatile and can skew the results
154157
continue
158+
output = model(**tensors_to_device(sample, "cuda:0"))
155159
total_ppl += torch.exp(output.loss).item()
156-
total_non_nan += 1
160+
total_samples += 1
157161

158-
avg_ppl = total_ppl / total_non_nan
162+
avg_ppl = total_ppl / total_samples
159163
assert avg_ppl <= config["ppl_threshold"]

0 commit comments

Comments
 (0)