# Source: kimi_k2_thinking_nvfp4a16.py — example script from a fork of
# vllm-project/llm-compressor (36 lines, 1.08 KB).
# (GitHub page chrome and the extracted line-number gutter removed.)
"""
NOTE: Please run the following script before using `model_free_ptq`
This script is used to reindex the safetensors files of a model such that all fused
modules (gate_up, qkv) are in the same safetensors file. This is required by
model_free_ptq for microscale schemes (NVFP4A16, MXFP4A16)
llmcompressor.reindex_fused_weights \
    unsloth/Kimi-K2-Thinking-BF16 \
    Kimi-K2-Thinking-BF16-reindexed \
    --num_workers=10
"""
from llmcompressor import model_free_ptq

MODEL_ID = "unsloth/Kimi-K2-Thinking-BF16"

# Derive the local directory names from the repo name once, instead of
# repeating the rstrip/split pipeline for each constant.
_MODEL_NAME = MODEL_ID.rstrip("/").split("/")[-1]
REINDEX_DIR = _MODEL_NAME + "-reindexed"
SAVE_DIR = _MODEL_NAME + "-NVFP4A16"


def main() -> None:
    """Apply NVFP4A16 model-free PTQ to the reindexed checkpoint.

    Reads the reindexed weights from REINDEX_DIR and writes the quantized
    model to SAVE_DIR. Requires `llmcompressor.reindex_fused_weights` to
    have been run first (see the module docstring).
    """
    # See above notice pertaining to safetensors reindexing
    # After running `llmcompressor.reindex_fused_weights`,
    # use `model_free_ptq` to apply NVFP4A16 quantization
    model_free_ptq(
        model_stub=REINDEX_DIR,
        save_directory=SAVE_DIR,
        scheme="NVFP4A16",
        # Modules excluded from quantization. NOTE(review): presumably the
        # router gates, lm_head, embeddings, and MLA low-rank projections are
        # kept in original precision for accuracy — confirm against the
        # llm-compressor examples for this architecture.
        ignore=[
            "re:.*gate$",
            "lm_head",
            "re:.*kv_a_proj_with_mqa$",
            "re:.*q_a_proj$",
            "model.embed_tokens",
        ],
        max_workers=15,
        device="cuda:0",
    )


# Guard the entry point so importing this module does not immediately
# launch a GPU quantization run.
if __name__ == "__main__":
    main()