Skip to content

Commit 4c84571

Browse files
committed
[Examples] Add Qwen3.5-27B NVFP4A16 and MXFP4A16 quantization examples
Signed-off-by: Ziming <frankziming26@outlook.com>
1 parent 370c04c commit 4c84571

File tree

2 files changed

+90
-0
lines changed

2 files changed

+90
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
from compressed_tensors.offload import dispatch_model
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Load the model and its tokenizer.
MODEL_ID = "Qwen/Qwen3.5-27B"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build the quantization recipe. Here we:
# * quantize the weights to fp4 with per-group-32 scales (MXFP4A16) via PTQ
# * leave the visual encoder, lm_head, linear attention (the fused
#   Gated DeltaNet projections are incompatible with microscale formats),
#   and MTP modules unquantized
recipe = QuantizationModifier(
    targets="Linear",
    scheme="MXFP4A16",
    ignore=[
        "lm_head",
        "re:.*visual.*",
        "re:.*linear_attn.*",
        "re:.*mtp.*",
    ],
)

# Run one-shot post-training quantization on the model.
oneshot(model=model, recipe=recipe)

# Sanity-check the quantized model with a short generation.
print("\n\n========== SAMPLE GENERATION ==============")
dispatch_model(model)
encoded = tokenizer("Hello my name is", return_tensors="pt")
prompt_ids = encoded.input_ids.to(model.device)
generated = model.generate(prompt_ids, max_new_tokens=100)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
print("==========================================\n\n")

# Persist the compressed checkpoint and tokenizer in
# compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").rpartition("/")[-1] + "-MXFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
from compressed_tensors.offload import dispatch_model
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Load the model and its tokenizer.
MODEL_ID = "Qwen/Qwen3.5-27B"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build the quantization recipe. Here we:
# * quantize the weights to fp4 with per-group-16 scales (NVFP4A16) via PTQ
# * leave the visual encoder, lm_head, linear attention (the fused
#   Gated DeltaNet projections are incompatible with NVFP4),
#   and MTP modules unquantized
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4A16",
    ignore=[
        "lm_head",
        "re:.*visual.*",
        "re:.*linear_attn.*",
        "re:.*mtp.*",
    ],
)

# Run one-shot post-training quantization on the model.
oneshot(model=model, recipe=recipe)

# Sanity-check the quantized model with a short generation.
print("\n\n========== SAMPLE GENERATION ==============")
dispatch_model(model)
encoded = tokenizer("Hello my name is", return_tensors="pt")
prompt_ids = encoded.input_ids.to(model.device)
generated = model.generate(prompt_ids, max_new_tokens=100)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
print("==========================================\n\n")

# Persist the compressed checkpoint and tokenizer in
# compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").rpartition("/")[-1] + "-NVFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

0 commit comments

Comments
 (0)