
Commit fbf2a6d

update examples
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 8ba0f2c commit fbf2a6d

34 files changed: +270 −276 lines changed

examples/awq/llama_example.py

Lines changed: 10 additions & 11 deletions
@@ -5,10 +5,12 @@
 from llmcompressor.modifiers.awq import AWQModifier
 
 # Select model and load it.
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -61,18 +63,15 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
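
Taken together, the two hunks above reorder llama_example.py so that the model is loaded once with device_map="auto", the in-memory quantized model is sanity-checked by generation, and the compressed checkpoint is saved last, with no reload from disk in between. The sketch below strings that ordering together end to end; it is a minimal sketch under stated assumptions: the AWQModifier recipe, the calibration-set preprocessing, and the oneshot arguments not visible in the hunks are illustrative, not copied from the unchanged parts of the file.

# Minimal sketch of the updated flow (recipe, dataset prep, and several oneshot
# arguments below are illustrative assumptions, not text from the example file).
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "mit-han-lab/pile-val-backup"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load model and tokenizer up front, as in the new version of the example.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build a small tokenized calibration set (this preprocessing is an assumption).
ds = load_dataset(DATASET_ID, split="validation")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(
    lambda sample: tokenizer(
        sample["text"],
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    ),
    remove_columns=ds.column_names,
)

# Quantize in place with an illustrative asymmetric W4A16 AWQ recipe.
recipe = AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"])
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Sanity-check generation on the quantized model, then save it compressed.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=100)[0]))

SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)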

examples/awq/qwen3_moe_example.py

Lines changed: 13 additions & 11 deletions
@@ -3,11 +3,15 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "Qwen/Qwen3-30B-A3B"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+MODEL_ID = "Qwen/Qwen3-30B-A3B"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -65,18 +69,16 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/compressed_inference/fp8_compressed_inference.py

Lines changed: 5 additions & 1 deletion
@@ -19,7 +19,11 @@
     "def fibonacci(n):",
 ]
 
-compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto")
+compressed_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_STUB,
+    torch_dtype="auto",
+    device_map="cuda:0",
+)
 
 # tokenize the sample data
 tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
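
With this change the compressed FP8 checkpoint is placed directly on cuda:0 at load time. For context, a minimal usage sketch of the surrounding example is below, using the "def fibonacci(n):" prompt visible in the context lines; the MODEL_STUB value and the generation length are placeholders, since the real stub is defined elsewhere in the file.

# Minimal usage sketch (MODEL_STUB value and max_new_tokens are placeholders,
# not taken from the example file).
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_STUB = "<compressed-fp8-model-stub>"  # placeholder for the stub the example defines

compressed_model = AutoModelForCausalLM.from_pretrained(
    MODEL_STUB,
    torch_dtype="auto",
    device_map="cuda:0",
)

# tokenize the sample data and generate directly from the compressed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to("cuda:0")
output = compressed_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))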

examples/multimodal_audio/whisper_example.py

Lines changed: 11 additions & 11 deletions
@@ -4,12 +4,14 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "openai/whisper-large-v3"
-model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+MODEL_ID = "openai/whisper-large-v3"
+
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
 model.config.forced_decoder_ids = None
-processor = WhisperProcessor.from_pretrained(model_id)
+processor = WhisperProcessor.from_pretrained(MODEL_ID)
 
 # Configure processor the dataset task.
 processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
@@ -83,17 +85,10 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 sample_features = next(iter(ds))["input_features"]
 sample_decoder_ids = [processor.tokenizer.prefix_tokens]
 sample_input = {
@@ -106,3 +101,8 @@ def data_collator(batch):
 # that's where you have a lot of windows in the south no actually that's passive solar
 # and passive solar is something that was developed and designed in the 1960s and 70s
 # and it was a great thing for what it was at the time but it's not a passive house
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "google/gemma-3-4b-it"
@@ -46,16 +47,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/idefics3_example.py

Lines changed: 7 additions & 8 deletions
@@ -6,6 +6,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"
@@ -92,16 +93,9 @@ def tokenize(sample):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -119,3 +113,8 @@ def tokenize(sample):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "llava-hf/llava-1.5-7b-hf"
@@ -47,16 +48,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mistral3_example.py

Lines changed: 7 additions & 8 deletions
@@ -8,6 +8,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@@ -60,16 +61,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -88,3 +82,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mllama_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@@ -47,16 +48,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 7 additions & 8 deletions
@@ -7,6 +7,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "microsoft/Phi-3-vision-128k-instruct"
@@ -78,14 +79,6 @@ def data_collator(batch):
     ignore=["lm_head", "re:model.vision_embed_tokens.*"],
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Perform oneshot
 oneshot(
     model=model,
@@ -99,7 +92,13 @@ def data_collator(batch):
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
