
Commit fbf2a6d

update examples
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 8ba0f2c commit fbf2a6d

34 files changed: +270 −276 lines changed

examples/awq/llama_example.py

Lines changed: 10 additions & 11 deletions
@@ -5,10 +5,12 @@
 from llmcompressor.modifiers.awq import AWQModifier
 
 # Select model and load it.
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -61,18 +63,15 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
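
Taken together, the two hunks above reorder llama_example.py so that the model is loaded once with device_map="auto", the in-memory quantized model is sanity-checked by generation, and the compressed checkpoint is saved last, with no reload from disk in between. The sketch below strings that ordering together end to end; it is a minimal sketch under stated assumptions: the AWQModifier recipe, the calibration-set preprocessing, and the oneshot arguments not visible in the hunks are illustrative, not copied from the unchanged parts of the file.

# Minimal sketch of the updated flow (recipe, dataset prep, and several oneshot
# arguments below are illustrative assumptions, not text from the example file).
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "mit-han-lab/pile-val-backup"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load model and tokenizer up front, as in the new version of the example.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build a small tokenized calibration set (this preprocessing is an assumption).
ds = load_dataset(DATASET_ID, split="validation")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(
    lambda sample: tokenizer(
        sample["text"],
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    ),
    remove_columns=ds.column_names,
)

# Quantize in place with an illustrative asymmetric W4A16 AWQ recipe.
recipe = AWQModifier(ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"])
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Sanity-check generation on the quantized model, then save it compressed.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=100)[0]))

SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)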

examples/awq/qwen3_moe_example.py

Lines changed: 13 additions & 11 deletions
@@ -3,11 +3,15 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "Qwen/Qwen3-30B-A3B"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+MODEL_ID = "Qwen/Qwen3-30B-A3B"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -65,18 +69,16 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)

examples/compressed_inference/fp8_compressed_inference.py

Lines changed: 5 additions & 1 deletion
@@ -19,7 +19,11 @@
     "def fibonacci(n):",
 ]
 
-compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto")
+compressed_model = AutoModelForCausalLM.from_pretrained(
+    MODEL_STUB,
+    torch_dtype="auto",
+    device_map="cuda:0",
+)
 
 # tokenize the sample data
 tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
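
With this change the compressed FP8 checkpoint is placed directly on cuda:0 at load time. For context, a minimal usage sketch of the surrounding example is below, using the "def fibonacci(n):" prompt visible in the context lines; the MODEL_STUB value and the generation length are placeholders, since the real stub is defined elsewhere in the file.

# Minimal usage sketch (MODEL_STUB value and max_new_tokens are placeholders,
# not taken from the example file).
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_STUB = "<compressed-fp8-model-stub>"  # placeholder for the stub the example defines

compressed_model = AutoModelForCausalLM.from_pretrained(
    MODEL_STUB,
    torch_dtype="auto",
    device_map="cuda:0",
)

# tokenize the sample data and generate directly from the compressed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to("cuda:0")
output = compressed_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))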

examples/multimodal_audio/whisper_example.py

Lines changed: 11 additions & 11 deletions
@@ -4,12 +4,14 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Select model and load it.
-model_id = "openai/whisper-large-v3"
-model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
+MODEL_ID = "openai/whisper-large-v3"
+
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
 model.config.forced_decoder_ids = None
-processor = WhisperProcessor.from_pretrained(model_id)
+processor = WhisperProcessor.from_pretrained(MODEL_ID)
 
 # Configure processor the dataset task.
 processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
@@ -83,17 +85,10 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = WhisperForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 sample_features = next(iter(ds))["input_features"]
 sample_decoder_ids = [processor.tokenizer.prefix_tokens]
 sample_input = {
@@ -106,3 +101,8 @@ def data_collator(batch):
 # that's where you have a lot of windows in the south no actually that's passive solar
 # and passive solar is something that was developed and designed in the 1960s and 70s
 # and it was a great thing for what it was at the time but it's not a passive house
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "google/gemma-3-4b-it"
@@ -46,16 +47,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Gemma3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100, disable_compile=True)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/idefics3_example.py

Lines changed: 7 additions & 8 deletions
@@ -6,6 +6,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"
@@ -92,16 +93,9 @@ def tokenize(sample):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Idefics3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -119,3 +113,8 @@ def tokenize(sample):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "llava-hf/llava-1.5-7b-hf"
@@ -47,16 +48,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = LlavaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mistral3_example.py

Lines changed: 7 additions & 8 deletions
@@ -8,6 +8,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@@ -60,16 +61,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = Mistral3ForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -88,3 +82,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/mllama_example.py

Lines changed: 7 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@@ -47,16 +48,9 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = MllamaForConditionalGeneration.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 messages = [
     {
         "role": "user",
@@ -74,3 +68,8 @@ def data_collator(batch):
 output = model.generate(**inputs, max_new_tokens=100)
 print(processor.decode(output[0], skip_special_tokens=True))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 7 additions & 8 deletions
@@ -7,6 +7,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.utils.dev import dispatch_for_generation
 
 # Load model.
 model_id = "microsoft/Phi-3-vision-128k-instruct"
@@ -78,14 +79,6 @@ def data_collator(batch):
     ignore=["lm_head", "re:model.vision_embed_tokens.*"],
 )
 
-# Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
-
-# Load model after saving
-model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")
-
 # Perform oneshot
 oneshot(
     model=model,
@@ -99,7 +92,13 @@ def data_collator(batch):
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
 input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids, max_new_tokens=20)
 print(processor.decode(output[0]))
 print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
