Skip to content

Commit 1b55a0d

Browse files
refactor(awq): restructure AWQModifier to be similar to SmoothQuant. Closes #2327
1 parent 5f63d7a commit 1b55a0d

File tree

9 files changed

+446
-205
lines changed

9 files changed

+446
-205
lines changed

examples/awq/fp8_block_llama_example.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="FP8_BLOCK", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="FP8_BLOCK",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="FP8_BLOCK",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

@@ -76,6 +87,6 @@ def tokenize(sample):
7687
print("==========================================\n\n")
7788

7889
# Save to disk compressed.
79-
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
90+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-fp8-block"
8091
model.save_pretrained(SAVE_DIR, save_compressed=True)
8192
tokenizer.save_pretrained(SAVE_DIR)

examples/awq/fp8_dynamic_llama_example.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="FP8_DYNAMIC", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="FP8_DYNAMIC",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="FP8_DYNAMIC",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

@@ -76,6 +87,6 @@ def tokenize(sample):
7687
print("==========================================\n\n")
7788

7889
# Save to disk compressed.
79-
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
90+
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-fp8-dynamic"
8091
model.save_pretrained(SAVE_DIR, save_compressed=True)
8192
tokenizer.save_pretrained(SAVE_DIR)

examples/awq/llama_example.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -49,9 +50,19 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
recipe = [
5356
AWQModifier(
54-
ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
57+
ignore=["lm_head"],
58+
scheme="W4A16_ASYM",
59+
targets=["Linear"],
60+
duo_scaling="both",
61+
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="W4A16_ASYM",
65+
ignore=["lm_head"],
5566
),
5667
]
5768

examples/awq/llama_example_with_masking.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
from llmcompressor import oneshot
1919
from llmcompressor.modifiers.awq import AWQModifier
20+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
2021

2122
# Select model and load it.
2223
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -105,9 +106,19 @@ def tokenize(sample):
105106
ds = ds.map(tokenize, remove_columns=ds.column_names)
106107

107108
# Configure the quantization algorithm to run.
109+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
110+
# which applies the actual quantization.
108111
recipe = [
109112
AWQModifier(
110-
ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
113+
ignore=["lm_head"],
114+
scheme="W4A16_ASYM",
115+
targets=["Linear"],
116+
duo_scaling="both",
117+
),
118+
QuantizationModifier(
119+
targets="Linear",
120+
scheme="W4A16_ASYM",
121+
ignore=["lm_head"],
111122
),
112123
]
113124

examples/awq/qwen3_coder_moe_example.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,18 +4,26 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
910
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"
1011

1112
# Configure the quantization algorithm to run.
13+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
14+
# which applies the actual quantization.
1215
recipe = [
1316
AWQModifier(
1417
duo_scaling=False,
1518
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
1619
scheme="W4A16",
1720
targets=["Linear"],
1821
),
22+
QuantizationModifier(
23+
targets="Linear",
24+
scheme="W4A16",
25+
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
26+
),
1927
]
2028

2129
# Select calibration dataset.

examples/awq/qwen3_moe_example.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
from llmcompressor import oneshot
66
from llmcompressor.modifiers.awq import AWQModifier
7+
from llmcompressor.modifiers.quantization.quantization import QuantizationModifier
78

89
# Select model and load it.
910
MODEL_ID = "Qwen/Qwen3-30B-A3B"
@@ -49,13 +50,20 @@ def tokenize(sample):
4950

5051

5152
# Configure the quantization algorithm to run.
53+
# AWQModifier performs smoothing and must be followed by a QuantizationModifier
54+
# which applies the actual quantization.
5255
# NOTE: vllm currently does not support asym MoE, using symmetric here
5356
recipe = [
5457
AWQModifier(
5558
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
5659
scheme="W4A16",
5760
targets=["Linear"],
5861
),
62+
QuantizationModifier(
63+
targets="Linear",
64+
scheme="W4A16",
65+
ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
66+
),
5967
]
6068

6169
# Apply algorithms.

0 commit comments

Comments (0)