
Commit a275f53

update examples to load before generating

Signed-off-by: Kyle Sayers <[email protected]>

1 parent 0348243 · commit a275f53
31 files changed: +227 −129 lines

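The common pattern across every file below: write the compressed checkpoint first, then reload it from disk before running the sample generation, so the sanity check exercises the saved artifact rather than the still-in-memory model. A minimal sketch of that flow for a causal LM is shown here; the model_id and SAVE_DIR suffix are placeholders (each example defines its own), the oneshot(...) calibration step is elided, and save_compressed=True is the llmcompressor-extended save_pretrained used throughout these examples.

# Minimal sketch of the reordered save -> load -> generate flow.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder; each example uses its own model_id
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# ... oneshot(...) calibration/quantization runs here, exactly as in each example ...

# 1) Save to disk compressed (save_compressed is provided by llmcompressor's
#    wrapped save_pretrained, as used in the diffs below).
SAVE_DIR = model_id.split("/")[-1] + "-compressed"  # placeholder suffix
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# 2) Reload the model from the saved directory.
model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

# 3) Confirm generations of the quantized model look sane.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(model.device)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))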
examples/awq/llama_example.py

Lines changed: 8 additions & 5 deletions
@@ -61,15 +61,18 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)

examples/awq/qwen3_moe_example.py

Lines changed: 8 additions & 5 deletions
@@ -65,15 +65,18 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)

examples/multimodal_audio/whisper_example.py

Lines changed: 8 additions & 6 deletions
@@ -83,6 +83,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
@@ -92,15 +100,9 @@ def data_collator(batch):
     "input_features": torch.tensor(sample_features).to(model.device),
     "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
 }
-
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")
 # that's where you have a lot of windows in the south no actually that's passive solar
 # and passive solar is something that was developed and designed in the 1960s and 70s
 # and it was a great thing for what it was at the time but it's not a passive house
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 8 additions & 5 deletions
@@ -46,6 +46,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -65,8 +73,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/idefics3_example.py

Lines changed: 8 additions & 5 deletions
@@ -92,6 +92,14 @@ def tokenize(sample):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -111,8 +119,3 @@ def tokenize(sample):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 8 additions & 5 deletions
@@ -47,6 +47,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -66,8 +74,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mistral3_example.py

Lines changed: 8 additions & 5 deletions
@@ -60,6 +60,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -80,8 +88,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mllama_example.py

Lines changed: 8 additions & 5 deletions
@@ -47,6 +47,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -66,8 +74,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 8 additions & 5 deletions
@@ -78,6 +78,14 @@ def data_collator(batch):
     ignore=["lm_head", "re:model.vision_embed_tokens.*"],
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Perform oneshot
 oneshot(
     model=model,
@@ -95,8 +103,3 @@ def data_collator(batch):
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/pixtral_example.py

Lines changed: 8 additions & 5 deletions
@@ -53,6 +53,14 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
+
+# Load model after saving
+model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
+
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
 messages = [
@@ -72,8 +80,3 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
-
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
