Merge branch 'main' into qwen3VLMoE_lineared

dsikka · web-flow · commit fb9e3ce78cef · 2025-09-30T20:16:32.000-04:00
diff --git a/examples/transform/quip_example.py b/examples/transform/quip_example.py
@@ -21,7 +21,9 @@
 #   * apply quip transforms to model in order to make quantization easier
 #   * quantize the weights to 4 bit with a group size 128
 recipe = [
-    QuIPModifier(rotations=["v", "u"], transform_type="random-hadamard"),
+    QuIPModifier(
+        rotations=["v", "u"], transform_block_size=128, transform_type="random-hadamard"
+    ),
     QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]
 
diff --git a/examples/transform/spinquant_example.py b/examples/transform/spinquant_example.py
@@ -11,14 +11,15 @@
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
-# NOTE: currently only fused rotations (R1 & R2) are available
-# Learned rotations and online rotations (R3 & R4) will be added
-# in a future release.
+# NOTE: currently only rotations R1, R2, and R4 are available
+# R3 and learned R1/R2 rotations will be added in a future release.
 # Configure the quantization algorithm to run.
 #   * apply spinquant transforms to model to reduce quantization loss
 #   * quantize the weights to 4 bit with group size 128
 recipe = [
-    SpinQuantModifier(rotations=["R1", "R2"], transform_type="hadamard"),
+    SpinQuantModifier(
+        rotations=["R1", "R2", "R4"], transform_block_size=64, transform_type="hadamard"
+    ),
     QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
 ]
 
@@ -37,6 +38,6 @@
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2-w4a16"
+SAVE_DIR = MODEL_ID.split("/")[1] + "-spinquantR1R2R4-w4a16"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py
@@ -10,6 +10,7 @@
 import math
 import os
 
+from compressed_tensors.utils import deprecated
 from loguru import logger
 from transformers import PreTrainedModel
 
@@ -22,6 +23,13 @@
 from .utils import post_process, pre_process
 
 
+@deprecated(
+    message=(
+        "Training support will be removed in future releases. Please use "
+        "the llmcompressor Axolotl integration for fine-tuning "
+        "https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open"  # noqa: E501
+    )
+)
 def train(**kwargs) -> PreTrainedModel:
     """
     Fine-tuning entrypoint that supports vanilla fine-tuning and
diff --git a/src/llmcompressor/modifiers/awq/base.py b/src/llmcompressor/modifiers/awq/base.py
@@ -139,8 +139,10 @@ class AWQModifier(Modifier, QuantizationMixin):
         default_factory=dict
     )
 
+    # NOTE: different name chosen to avoid collision with
+    # QuantizationMixin.validate_model_after, which must be called first
     @model_validator(mode="after")
-    def validate_model_after(model: "AWQModifier") -> "AWQModifier":
+    def validate_awq_after(model: "AWQModifier") -> "AWQModifier":
         """
         Confirm only one configuration for group_size, symmetric, and num_bits,
         as AWQ algorithm depends on it
diff --git a/src/llmcompressor/modifiers/transform/quip/base.py b/src/llmcompressor/modifiers/transform/quip/base.py
@@ -135,7 +135,7 @@ def _create_config(self) -> TransformConfig:
     def _create_v_scheme(self) -> TransformScheme:
         return TransformScheme(
             type=self.transform_type,
-            block_size=self.transform_block_size,
+            head_dim=self.transform_block_size,
             apply=[
                 TransformArgs(
                     targets=self.targets,
@@ -157,7 +157,7 @@ def _create_v_scheme(self) -> TransformScheme:
     def _create_u_scheme(self) -> TransformScheme:
         return TransformScheme(
             type=self.transform_type,
-            block_size=self.transform_block_size,
+            head_dim=self.transform_block_size,
             apply=[
                 TransformArgs(
                     targets=self.targets,
diff --git a/src/llmcompressor/modifiers/transform/spinquant/base.py b/src/llmcompressor/modifiers/transform/spinquant/base.py
@@ -193,7 +193,7 @@ def _create_r1_scheme(self) -> TransformScheme:
             randomize=self.randomize,
             requires_grad=self.learnable,
             precision=self.precision,
-            block_size=self.transform_block_size,
+            head_dim=self.transform_block_size,
             apply=[
                 TransformArgs(
                     targets=[
@@ -240,7 +240,7 @@ def _create_r2_scheme(self, model: PreTrainedModel) -> TransformScheme:
             randomize=self.randomize,
             requires_grad=self.learnable,
             precision=self.precision,
-            block_size=head_dim,
+            head_dim=head_dim,
             apply=[
                 TransformArgs(targets=[self.mappings.attn_v], location="weight_output"),
                 TransformArgs(
@@ -262,7 +262,7 @@ def _create_r4_scheme(self) -> TransformScheme:
             randomize=self.randomize,
             requires_grad=self.learnable,
             precision=self.precision,
-            block_size=self.transform_block_size,
+            head_dim=self.transform_block_size,
             apply=[
                 TransformArgs(
                     targets=[*self.mappings.mlp_out],
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_asym_awq.yaml
@@ -1,9 +1,8 @@
 cadence: "nightly"
 test_type: "regression"
 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml
-dataset_id: "mit-han-lab/pile-val-backup"
-dataset_split: validation
-num_calibration_samples: 2000
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
 scheme: W4A16_weight_asym_awq
 save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-asym-awq
diff --git a/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml b/tests/e2e/vLLM/configs/w4a16_grouped_quant_sym_awq.yaml
@@ -0,0 +1,8 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_sym.yaml
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+scheme: W4A16_weight_sym_awq
+save_dir: TinyLlama-1.1B-Chat-v1.0-w4a16-sym-awq
diff --git a/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml b/tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_awq_asym.yaml
diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py
@@ -145,15 +145,19 @@ def test_perplexity(setup_model_and_config):
     dispatch_for_generation(model)
 
     total_ppl = 0.0
-    total_non_nan = 0
-    for idx, sample in enumerate(dataloader):
-        if idx >= config["num_eval"]:
+    total_samples = 0
+    for sample in dataloader:
+        if total_samples >= config["num_eval"]:
             break
-        output = model(**tensors_to_device(sample, "cuda:0"))
-        if torch.isnan(output.loss):
+        # -100 in labels indicates that the token is not part of the loss calculation
+        pct_labels_in_sample = (sample["labels"] != -100).to(torch.float).mean().item()
+        if pct_labels_in_sample <= 0.25:
+            # More than 25% of the tokens in the sample must be part of loss calculation
+            # otherwise the perplexity is too volatile and can skew the results
             continue
+        output = model(**tensors_to_device(sample, "cuda:0"))
         total_ppl += torch.exp(output.loss).item()
-        total_non_nan += 1
+        total_samples += 1
 
-    avg_ppl = total_ppl / total_non_nan
+    avg_ppl = total_ppl / total_samples
     assert avg_ppl <= config["ppl_threshold"]
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -285,18 +285,6 @@ def process(sample):
                 "images": sample["image"],
             }
 
-    elif ds_name == "pile-val-backup":
-
-        def preprocess(example):
-            return {
-                "input_ids": processor.encode(example["text"].strip())[:max_seq_length]
-            }
-
-        ds = ds.map(preprocess, remove_columns=ds.column_names)
-        # Note: potentially swap filtering to pad for AWQ
-        ds = ds.filter(lambda example: len(example["input_ids"]) >= max_seq_length)
-        return ds
-
     else:
         raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")