
Commit 4b4257f

add compress model script
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 53ea307 commit 4b4257f

File tree

1 file changed: +60 -0 lines changed


compress_model.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# python3 compress_model.py --model_id meta-llama/Llama-3.2-1B-Instruct --transform_type random-hadamard

import argparse

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.transform import SpinQuantModifier
from llmcompressor.utils import dispatch_for_generation


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, required=True, help="Model stub to compress")
    parser.add_argument("--transform_type", type=str, default=None, help="Type of transform used in SpinQuantModifier")
    parser.add_argument("--scheme", type=str, default=None, help="Quantization scheme (e.g. W4A16)")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # Select model and load it.
    MODEL_ID = args.model_id
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Select number of samples. 512 samples is a good place to start.
    # Increasing the number of samples can improve accuracy.
    NUM_CALIBRATION_SAMPLES = 512
    MAX_SEQUENCE_LENGTH = 2048

    # Configure the quantization algorithm to run.
    recipe = []
    if args.transform_type:
        recipe.append(SpinQuantModifier(rotations=["R1", "R2"], transform_type=args.transform_type))

    if args.scheme:
        recipe.append(QuantizationModifier(targets="Linear", scheme=args.scheme, ignore=["lm_head"]))

    # Apply algorithms.
    oneshot(
        model=model,
        recipe=recipe,
        dataset="ultrachat_200k",
        splits={"calibration": f"train_sft[:{NUM_CALIBRATION_SAMPLES}]"},
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    # Confirm generations of the quantized model look sane.
    print("\n\n")
    print("========== SAMPLE GENERATION ==============")
    dispatch_for_generation(model)
    input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
    output = model.generate(input_ids, max_new_tokens=100)
    print(tokenizer.decode(output[0]))
    print("==========================================\n\n")

    # Save to disk compressed. Use the last path component so local paths also work.
    SAVE_DIR = MODEL_ID.split("/")[-1] + f"-{args.transform_type}-{args.scheme}"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
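
For reference, a few example invocations based on the flags defined in parse_args() above (the model stub and flag values are illustrative, matching the usage comment at the top of the script):

python3 compress_model.py --model_id meta-llama/Llama-3.2-1B-Instruct --transform_type random-hadamard
python3 compress_model.py --model_id meta-llama/Llama-3.2-1B-Instruct --scheme W4A16
python3 compress_model.py --model_id meta-llama/Llama-3.2-1B-Instruct --transform_type random-hadamard --scheme W4A16

The first applies only the SpinQuant rotations, the second only quantization, and the third builds a recipe that applies the rotations before calibrating quantization.

Because the model is saved with save_compressed=True, the output directory can typically be served by an inference engine that understands the compressed-tensors format. A minimal sketch, assuming vLLM is installed and the script was run with both flags as above (the directory name is hypothetical, following the SAVE_DIR pattern in the script):

from vllm import LLM

# Hypothetical path: SAVE_DIR is "<model name>-<transform_type>-<scheme>" per the script above.
model = LLM("./Llama-3.2-1B-Instruct-random-hadamard-W4A16")
output = model.generate("Hello my name is")
print(output[0].outputs[0].text)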
