Skip to content

Commit 1b55a0d

Browse files
refactor(awq): restructure AWQModifier to be similar to SmoothQuant. Closes #2327
1 parent 5f63d7a commit 1b55a0d

File tree

9 files changed

+446
-205
lines changed

9 files changed

+446
-205
lines changed

examples/awq/fp8_block_llama_example.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="FP8_BLOCK", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="FP8_BLOCK",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="FP8_BLOCK",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

@@ -76,6 +87,6 @@ def tokenize(sample):
7687
print("==========================================\n\n")
7788

7889
# Save to disk compressed.
79-
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
90+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-fp8-block"
8091
model.save_pretrained(SAVE_DIR, save_compressed=True)
8192
tokenizer.save_pretrained(SAVE_DIR)

examples/awq/fp8_dynamic_llama_example.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="FP8_DYNAMIC", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="FP8_DYNAMIC",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="FP8_DYNAMIC",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

@@ -76,6 +87,6 @@ def tokenize(sample):
7687
print("==========================================\n\n")
7788

7889
# Save to disk compressed.
79-
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
90+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-fp8-dynamic"
8091
model.save_pretrained(SAVE_DIR, save_compressed=True)
8192
tokenizer.save_pretrained(SAVE_DIR)

examples/awq/llama_example.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="W4A16_ASYM",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="W4A16_ASYM",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

examples/awq/llama_example_with_masking.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
from llmcompressor import oneshot
1919
from llmcompressor.modifiers.awq import AWQModifier
20+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
2021

2122
# Select model and load it.
2223
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -105,9 +106,19 @@ def tokenize(sample):
105106
ds = ds.map(tokenize, remove_columns=ds.column_names)
106107

107108
# Configure the quantization algorithm to run.
109+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
110+
# which applies the actual quantization.
108111
recipe = [
109112
AWQModifier(
110-
ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
113+
ignore=["lm_head"],
114+
scheme="W4A16_ASYM",
115+
targets=["Linear"],
116+
duo_scaling="both",
117+
),
118+
QuantizationModifier(
119+
targets="Linear",
120+
scheme="W4A16_ASYM",
121+
ignore=["lm_head"],
111122
),
112123
]
113124

examples/awq/qwen3_coder_moe_example.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,18 +4,26 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
910
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"
1011

1112
# Configure the quantization algorithm to run.
13+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
14+
# which applies the actual quantization.
1215
recipe = [
1316
AWQModifier(
1417
duo_scaling=False,
1518
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
1619
scheme="W4A16",
1720
targets=["Linear"],
1821
),
22+
QuantizationModifier(
23+
targets="Linear",
24+
scheme="W4A16",
25+
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
26+
),
1927
]
2028

2129
# Select calibration dataset.

examples/awq/qwen3_moe_example.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "Qwen/Qwen3-30B-A3B"
@@ -49,13 +50,20 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
# NOTE: vllm currently does not support asym MoE, using symmetric here
5356
recipe = [
5457
AWQModifier(
5558
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
5659
scheme="W4A16",
5760
targets=["Linear"],
5861
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="W4A16",
65+
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
66+
),
5967
]
6068

6169
# Apply algorithms.

0 commit comments

Comments (0)