
Commit 3e721cb

Add Qwen3-Coder awq moe example (#1863)
SUMMARY: Add a `Qwen3-Coder` AWQ example based on https://huggingface.co/nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq.

TEST PLAN: Ran the script locally and verified that the resulting model outputs were reasonable.

Prompt: "Write a binary search function"

Output:

```python
def binary_search(arr, target):
    left = 0
    right = len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
```

---------

Signed-off-by: Fynn Schmitt-Ulms <[email protected]>
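For context, a minimal, hypothetical sketch for loading the reference checkpoint linked in the summary with `transformers` (this is not part of the commit; it assumes the `compressed-tensors` package is installed so the W4A16 weights can be read) might look like:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical quick check against the published reference checkpoint.
# Assumes `compressed-tensors` is installed so transformers can load the
# W4A16 compressed weights.
REF_ID = "nm-testing/Qwen3-Coder-30B-A3B-Instruct-W4A16-awq"

model = AutoModelForCausalLM.from_pretrained(REF_ID, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(REF_ID)

inputs = tokenizer("Write a binary search function", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=150)[0]))
```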
1 parent a824136 commit 3e721cb

File tree

1 file changed (+81 −0 lines)
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run.
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "codeparrot/self-instruct-starcoder"
DATASET_SPLIT = "curated"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    ds = load_dataset(
        DATASET_ID,
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        chat_messages = [
            {"role": "user", "content": example["instruction"].strip()},
            {"role": "assistant", "content": example["output"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(chat_messages, tokenize=True)
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )
    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
    )

    # Confirm generations of the quantized model look sane.
    print("========== SAMPLE GENERATION ==============")
    dispatch_for_generation(model)
    input_ids = tokenizer(
        "Write a binary search function", return_tensors="pt"
    ).input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=150)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    # Save model to disk
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
```
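Once the script has written the compressed checkpoint to `SAVE_DIR`, it can be served for inference. A minimal sketch using vLLM follows; this is an assumption rather than part of the committed example, and it presumes a vLLM build that supports compressed-tensors W4A16 checkpoints:

```python
from vllm import LLM, SamplingParams

# Hypothetical follow-up: serve the freshly saved W4A16 checkpoint with vLLM.
# The path matches the SAVE_DIR used in the example above.
llm = LLM(model="Qwen3-Coder-30B-A3B-Instruct-W4A16-awq")
params = SamplingParams(temperature=0.0, max_tokens=150)

outputs = llm.generate(["Write a binary search function"], params)
print(outputs[0].outputs[0].text)
```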
