Skip to content

Commit 1690a4c

Browse files
authored
GPTQ Actorder Refactor (#2541)
Summary: this code was really spaghetti before, tried to clean it up and make it a little more readable. clean up actorder if tower no longer saving random values we don't need (g_idx that hasn't been permuted) update tests to actually test actorder make tests faster (smaller model + don't tokenize whole dataset) note: smaller model required me to change groupsize to 32 TEST PLAN: pytest -vs -rs /home/HDCharles/repos/llm-compressor/tests/llmcompressor/transformers/gptq/test_gptq_oneshot.py --------- Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
1 parent 01b6405 commit 1690a4c

File tree

2 files changed

+105
-21
lines changed

2 files changed

+105
-21
lines changed

src/llmcompressor/modifiers/gptq/gptq_quantize.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
QuantizationStrategy,
1010
fake_quantize,
1111
)
12-
from compressed_tensors.utils import update_offload_parameter
1312
from loguru import logger
1413

1514
from llmcompressor.modifiers.utils import SPARSITY_THRESHOLD
@@ -110,7 +109,8 @@ def quantize_weight(
110109
num_rows = W.shape[0]
111110
num_columns = W.shape[1]
112111

113-
# generate scale, should include tensor group / use global scale
112+
scale, zero_point = observer(W)
113+
# handle g_idx and activation ordering
114114
if strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
115115
# mapping from column index to group index
116116
g_idx = (
@@ -119,26 +119,16 @@ def quantize_weight(
119119
)
120120

121121
if actorder == ActivationOrdering.GROUP:
122-
# permute by activation order first, then update groups
123122
W, H, perm = _apply_activation_ordering(W, H)
124-
update_offload_parameter(module, "weight_g_idx", g_idx)
123+
# actually need scale/zp for permuted weight for this format
125124
scale, zero_point = observer(W)
126-
127125
# use identity g_idx (invert permutation later)
128126

129127
elif actorder == ActivationOrdering.WEIGHT:
130-
# update groups first, then permute by activation order
131-
scale, zero_point = observer(W)
128+
# permute weights and g_idx
132129
W, H, perm = _apply_activation_ordering(W, H)
133-
134-
# permute g_idx to maintain identity mapping after unpermutation
135130
g_idx = g_idx[perm]
136131

137-
else:
138-
scale, zero_point = observer(W)
139-
else:
140-
scale, zero_point = observer(W)
141-
142132
# sparsity mask
143133
sparsity = tensor_sparsity(W)
144134
preserve_zeros = sparsity >= SPARSITY_THRESHOLD

tests/llmcompressor/transformers/gptq/test_gptq_oneshot.py

Lines changed: 101 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import pytest
22
import torch
3-
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
4-
from transformers import AutoModelForCausalLM
3+
from compressed_tensors.quantization import (
4+
ActivationOrdering,
5+
QuantizationArgs,
6+
QuantizationScheme,
7+
)
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
59

610
from llmcompressor import oneshot
711
from llmcompressor.modifiers.gptq import GPTQModifier
@@ -36,17 +40,60 @@
3640
config_groups={
3741
"group_0": QuantizationScheme(
3842
targets=["re:.*model.layers.2.self_attn.q_proj$"],
39-
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
43+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
4044
)
4145
},
4246
)
4347

4448
recipe_modifier_shorthand_a = GPTQModifier(
45-
ignore=["lm_head"], targets="re:.*model.layers.2.self_attn.q_proj$", scheme="W4A16"
49+
ignore=["lm_head"],
50+
config_groups={
51+
"group_0": QuantizationScheme(
52+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
53+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
54+
)
55+
},
4656
)
4757

4858
recipe_modifier_shorthand_b = GPTQModifier(
49-
ignore=["lm_head"], scheme={"W4A16": ["re:.*model.layers.2.self_attn.q_proj$"]}
59+
ignore=["lm_head"],
60+
config_groups={
61+
"group_0": QuantizationScheme(
62+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
63+
weights=QuantizationArgs(num_bits=4, strategy="group", group_size=32),
64+
)
65+
},
66+
)
67+
68+
# Test activation ordering variants
69+
recipe_modifier_group_actorder_weight = GPTQModifier(
70+
ignore=["lm_head"],
71+
config_groups={
72+
"group_0": QuantizationScheme(
73+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
74+
weights=QuantizationArgs(
75+
num_bits=4,
76+
strategy="group",
77+
group_size=32,
78+
actorder=ActivationOrdering.WEIGHT,
79+
),
80+
)
81+
},
82+
)
83+
84+
recipe_modifier_group_actorder_group = GPTQModifier(
85+
ignore=["lm_head"],
86+
config_groups={
87+
"group_0": QuantizationScheme(
88+
targets=["re:.*model.layers.2.self_attn.q_proj$"],
89+
weights=QuantizationArgs(
90+
num_bits=4,
91+
strategy="group",
92+
group_size=32,
93+
actorder=ActivationOrdering.GROUP,
94+
),
95+
)
96+
},
5097
)
5198

5299

@@ -58,20 +105,38 @@
58105
recipe_modifier_full_group,
59106
recipe_modifier_shorthand_a,
60107
recipe_modifier_shorthand_b,
108+
recipe_modifier_group_actorder_weight,
109+
recipe_modifier_group_actorder_group,
61110
],
62111
)
63112
def test_oneshot_application(recipe, tmp_path):
64113
output = tmp_path / "oneshot_output"
65-
model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
114+
model_id = "nm-testing/tinysmokellama-3.2"
66115
dataset = "open_platypus"
67116
device = "cuda:0" if torch.cuda.is_available() else "cpu"
68117

118+
# Load original model for numerical comparison
119+
original_model = AutoModelForCausalLM.from_pretrained(
120+
model_id, torch_dtype=torch.float16, device_map=device
121+
)
122+
tokenizer = AutoTokenizer.from_pretrained(model_id)
123+
124+
# Create test input
125+
test_text = "The quick brown fox jumps over the lazy dog"
126+
inputs = tokenizer(test_text, return_tensors="pt").to(device)
127+
128+
# Get original model output
129+
with torch.no_grad():
130+
original_output = original_model(**inputs).logits
131+
132+
# Quantize model
69133
oneshot(
70-
model=model,
134+
model=model_id,
71135
dataset=dataset,
72136
output_dir=output,
73137
recipe=recipe,
74138
num_calibration_samples=9,
139+
splits={"calibration": "train[:9]"},
75140
)
76141
model_loaded = AutoModelForCausalLM.from_pretrained(output, device_map=device)
77142

@@ -98,3 +163,32 @@ def test_oneshot_application(recipe, tmp_path):
98163
# Check lm-head is not quantized
99164
not_targetted = model_loaded.lm_head
100165
assert not hasattr(not_targetted, "quantization_scheme")
166+
167+
# Verify g_idx behavior for activation ordering
168+
if weight_args.actorder == ActivationOrdering.GROUP:
169+
# GROUP actorder should save g_idx
170+
assert hasattr(
171+
targetted_linear_layer, "weight_g_idx"
172+
), "GROUP actorder should have g_idx"
173+
elif weight_args.actorder == ActivationOrdering.WEIGHT:
174+
# WEIGHT actorder should NOT save g_idx (identity mapping)
175+
assert not hasattr(
176+
targetted_linear_layer, "weight_g_idx"
177+
), "WEIGHT actorder should not have g_idx"
178+
179+
# Numerical validation: check MSE
180+
with torch.no_grad():
181+
quantized_output = model_loaded(**inputs).logits
182+
183+
mse = torch.nn.functional.mse_loss(quantized_output, original_output).item()
184+
185+
# MSE threshold - quantization should not degrade quality too much
186+
mse_threshold = 0.015
187+
assert mse < mse_threshold, (
188+
f"MSE {mse:.6f} exceeds threshold {mse_threshold}. "
189+
f"Quantization degraded model quality too much."
190+
)
191+
192+
# Cleanup
193+
del original_model, model_loaded
194+
torch.cuda.empty_cache()

0 commit comments

Comments (0)