
Commit a275f53

update examples to load before generating

Signed-off-by: Kyle Sayers <[email protected]>

1 parent 0348243 · commit a275f53
31 files changed: +227 −129 lines

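The common pattern across every file below: write the compressed checkpoint first, then reload it from disk before running the sample generation, so the sanity check exercises the saved artifact rather than the still-in-memory model. A minimal sketch of that flow for a causal LM is shown here; the model_id and SAVE_DIR suffix are placeholders (each example defines its own), the oneshot(...) calibration step is elided, and save_compressed=True is the llmcompressor-extended save_pretrained used throughout these examples.

# Minimal sketch of the reordered save -> load -> generate flow.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder; each example uses its own model_id
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ... oneshot(...) calibration/quantization runs here, exactly as in each example ...

# 1) Save to disk compressed (save_compressed is provided by llmcompressor's
#    wrapped save_pretrained, as used in the diffs below).
SAVE_DIR = model_id.split("/")[-1] + "-compressed"  # placeholder suffix
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# 2) Reload the model from the saved directory.
model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

# 3) Confirm generations of the quantized model look sane.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))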
examples/awq/llama_example.py

Lines changed: 8 additions & 5 deletions
@@ -61,15 +61,18 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)

examples/awq/qwen3_moe_example.py

Lines changed: 8 additions & 5 deletions
@@ -65,15 +65,18 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)

examples/multimodal_audio/whisper_example.py

Lines changed: 8 additions & 6 deletions
@@ -83,6 +83,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
@@ -92,15 +100,9 @@ def data_collator(batch):
     "input_features": torch.tensor(sample_features).to(model.device),
     "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
 }
-
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")
 # that's where you have a lot of windows in the south no actually that's passive solar
 # and passive solar is something that was developed and designed in the 1960s and 70s
 # and it was a great thing for what it was at the time but it's not a passive house
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 8 additions & 5 deletions
@@ -46,6 +46,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -65,8 +73,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/idefics3_example.py

Lines changed: 8 additions & 5 deletions
@@ -92,6 +92,14 @@ def tokenize(sample):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -111,8 +119,3 @@ def tokenize(sample):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 8 additions & 5 deletions
@@ -47,6 +47,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -66,8 +74,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mistral3_example.py

Lines changed: 8 additions & 5 deletions
@@ -60,6 +60,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -80,8 +88,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mllama_example.py

Lines changed: 8 additions & 5 deletions
@@ -47,6 +47,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -66,8 +74,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 8 additions & 5 deletions
@@ -78,6 +78,14 @@ def data_collator(batch):
     ignore=["lm_head", "re:model.vision_embed_tokens.*"],
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Perform oneshot
 oneshot(
     model=model,
@@ -95,8 +103,3 @@ def data_collator(batch):
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/pixtral_example.py

Lines changed: 8 additions & 5 deletions
@@ -53,6 +53,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -72,8 +80,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
