
Commit 9e916b6

kylesayrs and dsikka authored
[GPTQ] Change actorder default to "static" (#1425)
## Purpose ##
* Use best defaults for GPTQ quantization

## Prerequisites ##
* #1453
* #1468

## Changes ##
* Set GPTQ actorder default to "static"

## Testing ##
* Ran llama w4a16 example to completion and validated the correct activation ordering

---------

Signed-off-by: Kyle Sayers <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
1 parent 8f8405c · commit 9e916b6

19 files changed · +85 −26 lines changed

src/llmcompressor/modifiers/quantization/gptq/base.py

Lines changed: 14 additions & 14 deletions
```diff
@@ -72,8 +72,9 @@ class GPTQModifier(Modifier, QuantizationMixin):
     :param block_size: Used to determine number of columns to compress in one pass
     :param dampening_frac: Amount of dampening to apply to H, as a fraction of the
         diagonal norm
-    :param actorder: order in which weight columns are quantized. For more information,
-        on actorder options, see https://github.com/vllm-project/vllm/pull/8135
+    :param actorder: order in which weight columns are quantized. Defaults to "static"
+        activation ordering, which achieves best accuracy recovery with no runtime cost.
+        For more information, see https://github.com/vllm-project/vllm/pull/8135
     :param offload_hessians: Set to True for decreased memory usage but increased
         runtime.
```
```diff
@@ -106,7 +107,7 @@ class GPTQModifier(Modifier, QuantizationMixin):
     sequential_targets: Union[str, List[str], None] = None
     block_size: int = 128
     dampening_frac: Optional[float] = 0.01
-    actorder: Optional[Union[ActivationOrdering, Sentinel]] = None
+    actorder: Optional[Union[ActivationOrdering, Sentinel]] = Sentinel("static")
     offload_hessians: bool = False

     # private variables
```
```diff
@@ -134,18 +135,17 @@ def resolve_actorder(existing):
                 return ActivationOrdering.STATIC if existing is None else existing

             # user-provided value always attempts to override
-            if self.actorder is not None:
-                if existing is None or self.actorder == existing:
-                    return self.actorder
-                raise ValueError(
-                    "Cannot resolve activation ordering when both "
-                    "`GPTQModifier.actorder` and `QuantizationScheme.actorder` "
-                    "are provided and differ. Either set `GPTQModifier.actorder = "
-                    "None` or remove `actorder` from config groups."
-                )
+            if existing is None or self.actorder == existing:
+                return self.actorder

-            # setting `GPTQModifier.actorder = None` does nothing
-            return existing
+            # if existing provided and conflicts
+            raise ValueError(
+                "Cannot resolve activation ordering when both "
+                "`GPTQModifier.actorder` and `QuantizationScheme.actorder` "
+                f"are provided and differ ({self.actorder}, {existing}). "
+                "Either unset `GPTQModifier.actorder` or "
+                "remove `actorder` from config groups."
+            )

         for scheme in config.config_groups.values():
             assert isinstance(scheme, QuantizationScheme)
```
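The resolution rules in this hunk can be summarized with a small standalone sketch. This is a simplified illustration using plain strings; `SENTINEL_STATIC` stands in for the repo's `Sentinel("static")` field default rather than the real compressed-tensors types:

```python
from typing import Optional

SENTINEL_STATIC = object()  # stand-in for the new Sentinel("static") field default


def resolve_actorder(modifier_value, scheme_value: Optional[str]) -> Optional[str]:
    """Simplified mirror of the resolution logic in GPTQModifier."""
    # sentinel default: defer to the scheme if it specifies actorder, else "static"
    if modifier_value is SENTINEL_STATIC:
        return scheme_value if scheme_value is not None else "static"

    # a user-provided value (including an explicit None) wins when the scheme
    # is silent or agrees with it
    if scheme_value is None or modifier_value == scheme_value:
        return modifier_value

    # both were provided and disagree: refuse to guess
    raise ValueError(
        f"Cannot resolve activation ordering ({modifier_value}, {scheme_value})"
    )


assert resolve_actorder(SENTINEL_STATIC, None) == "static"  # new default
assert resolve_actorder(None, None) is None                 # explicit opt-out now sticks
assert resolve_actorder("group", None) == "group"           # explicit value propagates
```

The notable behavior change is the second assertion: an explicit `actorder=None` now propagates instead of being ignored, which is why the test recipes below pin `actorder: null`.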

tests/e2e/e2e_utils.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -70,7 +70,10 @@ def data_collator(batch):
     # a compatible preset sceme
     if quant_type == "GPTQ":
         oneshot_kwargs["recipe"] = GPTQModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear",
+            scheme=scheme,
+            actorder=None,  # added for consistency with past testing configs
+            ignore=["lm_head"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
```
Lines changed: 9 additions & 0 deletions
```diff
@@ -0,0 +1,9 @@
+cadence: "nightly"
+test_type: "regression"
+model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml
+dataset_id: openai/gsm8k
+dataset_config: main
+dataset_split: train
+scheme: W4A16_actorder_none
+save_dir: TinyLlama-1.1B-Chat-v1.0-actorder-group
```
Lines changed: 8 additions & 0 deletions
```diff
@@ -0,0 +1,8 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen2.5-0.5B
+recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_none.yaml
+dataset_id: neuralmagic/LLM_compression_calibration
+dataset_split: train
+scheme: W4A16_actorder_none
+save_dir: Qwen2.5-0.5B-actorder-none
```

tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@ quant_stage:
       smoothing_strength: 0.8
     GPTQModifier:
       ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*", "re:model.visual.*"]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
```

tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_static_per_tensor_act.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@ quant_stage:
       smoothing_strength: 0.8
     GPTQModifier:
       ignore: [lm_head]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
```

tests/e2e/vLLM/recipes/INT8/recipe_w8a8_dynamic_asym.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -9,6 +9,7 @@ quant_stage:
         - re:.*post_attention_layernorm
     GPTQModifier:
       ignore: [lm_head]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
```

tests/e2e/vLLM/recipes/INT8/recipe_w8a8_static_asym.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -4,6 +4,7 @@ quant_stage:
       smoothing_strength: 0.8
     GPTQModifier:
       ignore: [lm_head]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}
```

tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,6 +2,7 @@ quant_stage:
   quant_modifiers:
     GPTQModifier:
       ignore: [lm_head]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 4, type: int, symmetric: true, strategy: channel, dynamic: false}
```

tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_group_quant_asym_awq.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,6 +2,7 @@ quant_stage:
   quant_modifiers:
     AWQModifier:
      ignore: [lm_head]
+      actorder: null
       config_groups:
         group_0:
           weights: {num_bits: 4, type: int, symmetric: false, strategy: "group", group_size: 128}
```
