
Commit 9c44074

Add NVFP4A16 quantization examples (#1679)
SUMMARY: Add NVFP4A16 quantization examples.

TEST PLAN: Run the qwen3_example.py file to compress the Qwen3-32B model using the NVFP4A16 compression method.

Co-authored-by: Dipika Sikka <[email protected]>
1 parent 6506253 commit 9c44074

1 file changed: 34 additions, 0 deletions

@@ -0,0 +1,34 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# Load model.
MODEL_ID = "Qwen/Qwen3-32B"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to FP4 (group size 16) via PTQ
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"])

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("\n\n========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0], skip_special_tokens=True))
print("==========================================\n\n")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
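
Note: the script ends by writing a compressed-tensors checkpoint to SAVE_DIR. As a minimal sketch (not part of this commit), the saved directory could be loaded back for inference with vLLM, assuming your vLLM build and GPU support the NVFP4A16 compressed-tensors format; the "Qwen3-32B-NVFP4A16" path below is simply the SAVE_DIR produced above.

# Minimal usage sketch (assumption): serve the saved compressed checkpoint with vLLM.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen3-32B-NVFP4A16")  # path written by SAVE_DIR above
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=100))
print(outputs[0].outputs[0].text)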

0 commit comments
