-
Notifications
You must be signed in to change notification settings - Fork 470
Expand file tree
/
Copy path qwen3_next_example.py
More file actions
36 lines (29 loc) · 969 Bytes
/
qwen3_next_example.py
File metadata and controls
36 lines (29 loc) · 969 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Example: FP8 dynamic quantization of Qwen3-Next with llmcompressor.
#
# Loads the model, applies an FP8_DYNAMIC quantization recipe to its Linear
# layers (minus a small ignore-list), and saves the result to disk in
# compressed-tensors format.
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# NOTE: Requires a minimum of transformers 4.57.0 (Qwen3-Next support).
# Fail fast with a clear message instead of a cryptic model-loading error.
_MIN_TRANSFORMERS = (4, 57)
_found = tuple(int(p) for p in transformers.__version__.split(".")[:2])
if _found < _MIN_TRANSFORMERS:
    raise RuntimeError(
        f"transformers>={'.'.join(map(str, _MIN_TRANSFORMERS))}.0 is required "
        f"for Qwen3-Next; found {transformers.__version__}"
    )

MODEL_ID = "Qwen/Qwen3-Next-80B-A3B-Instruct"

# Load model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype="auto",  # keep the checkpoint's native dtype
    low_cpu_mem_usage=True,
    trust_remote_code=True,  # NOTE(review): executes remote modeling code — only safe for trusted repos
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# FP8 dynamic quantization of all Linear layers: weights are quantized up
# front, activation scales are computed at runtime, so no calibration
# dataset is needed.
recipe = QuantizationModifier(
    targets=["Linear"],
    scheme="FP8_DYNAMIC",
    ignore=[
        "lm_head",
        # MoE router gates and the linear-attention modules are kept in the
        # original precision — presumably numerically sensitive; verify
        # against the model card / llmcompressor guidance before changing.
        "re:.*mlp.gate$",
        "re:.*mlp.shared_expert_gate$",
        "re:.*linear_attn.*",
    ],
)

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Save to disk in compressed-tensors format, e.g.
# "Qwen3-Next-80B-A3B-Instruct-FP8-Dynamic" in the working directory.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)