
Commit 2f4f28a

Add example
1 parent c3acdee commit 2f4f28a

File tree: 1 file changed, +25 -0 lines


examples/example_static_kvcache.py

@@ -0,0 +1,25 @@
from datasets import load_dataset
from transformers import AutoTokenizer

from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8-KV"

# Load the tokenizer and reuse the EOS token for padding so the
# calibration inputs can be batched together.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Build a small calibration set: 512 chat samples rendered with the
# model's chat template, then tokenized into one padded batch on GPU.
ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")

# Static FP8 quantization of weights and activations, skipping lm_head
# and also calibrating KV-cache scales from the k_proj/v_proj outputs.
quantize_config = BaseQuantizeConfig(
    quant_method="fp8",
    activation_scheme="static",
    ignore_patterns=["re:.*lm_head"],
    kv_cache_quant_targets=("k_proj", "v_proj"),
)

model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)
model.save_quantized(quantized_model_dir)
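
Not part of this commit, but a possible follow-up: a minimal sketch of loading the saved checkpoint for inference with vLLM, assuming a vLLM build that supports FP8 weights and an FP8 KV cache; the model path simply reuses the quantized_model_dir from the example above, and the prompt is illustrative.

# Hypothetical usage, not included in this commit: load the FP8 checkpoint
# with vLLM and enable the FP8 KV cache (assumes vLLM with FP8 support).
from vllm import LLM, SamplingParams

llm = LLM(model="Meta-Llama-3-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
sampling = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What does static FP8 quantization change?"], sampling)
print(outputs[0].outputs[0].text)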
