
Commit 929f678

update examples, remove offload devicemap utils
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 6f86244

39 files changed (+55, -579 lines)
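
The recurring edit across the example scripts below is to drop device_map from each from_pretrained call and load with torch_dtype="auto" only. A minimal, self-contained sketch of the updated loading pattern (the model ID mirrors the Llama example below; this is ordinary transformers usage, not a new API):

# Sketch of the loading pattern the updated examples share (illustrative).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# No device_map argument: the model loads with transformers' default placement.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)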

examples/awq/llama_example.py

Lines changed: 4 additions & 6 deletions

@@ -5,12 +5,10 @@
 from llmcompressor.modifiers.awq import AWQModifier
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -72,6 +70,6 @@ def tokenize(sample):
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
+SAVE_DIR = model_id.split("/")[-1] + "-awq-asym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)

examples/awq/qwen3_moe_example.py

Lines changed: 4 additions & 7 deletions

@@ -5,12 +5,9 @@
 from llmcompressor.modifiers.awq import AWQModifier
 
 # Select model and load it.
-MODEL_ID = "Qwen/Qwen3-30B-A3B"
-
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model_id = "Qwen/Qwen3-30B-A3B"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
 # Select calibration dataset.
 DATASET_ID = "mit-han-lab/pile-val-backup"
@@ -77,6 +74,6 @@ def tokenize(sample):
 print("==========================================\n\n")
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+SAVE_DIR = model_id.split("/")[-1] + "-awq-sym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)

examples/big_models_with_accelerate/README.md

Lines changed: 0 additions & 95 deletions
This file was deleted.

examples/big_models_with_accelerate/cpu_offloading_fp8.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

examples/big_models_with_accelerate/mult_gpus_int8_device_map.py

Lines changed: 0 additions & 81 deletions
This file was deleted.

examples/big_models_with_accelerate/multi_gpu_int8.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

examples/compressed_inference/fp8_compressed_inference.py

Lines changed: 1 addition & 5 deletions

@@ -19,11 +19,7 @@
     "def fibonacci(n):",
 ]
 
-compressed_model = AutoModelForCausalLM.from_pretrained(
-    MODEL_STUB,
-    torch_dtype="auto",
-    device_map="cuda:0",
-)
+compressed_model = AutoModelForCausalLM.from_pretrained(MODEL_STUB, torch_dtype="auto")
 
 # tokenize the sample data
 tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
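
After loading, such a compressed checkpoint is exercised like any other causal LM. A hedged sketch of that step (standard transformers generation; the prompt comes from the diff context above, and max_new_tokens is an illustrative choice, not necessarily the example's exact code):

# Assumes compressed_model and tokenizer from the example above (illustrative usage).
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(compressed_model.device)
output = compressed_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))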

examples/multimodal_audio/whisper_example.py

Lines changed: 4 additions & 9 deletions

@@ -6,15 +6,10 @@
 from llmcompressor.modifiers.quantization import GPTQModifier
 
 # Select model and load it.
-MODEL_ID = "openai/whisper-large-v3"
-
-model = WhisperForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    torch_dtype="auto",
-)
+model_id = "openai/whisper-large-v3"
+model = WhisperForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 model.config.forced_decoder_ids = None
-processor = WhisperProcessor.from_pretrained(MODEL_ID)
+processor = WhisperProcessor.from_pretrained(model_id)
 
 # Configure processor the dataset task.
 processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
@@ -106,6 +101,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 1 addition & 3 deletions

@@ -8,9 +8,7 @@
 
 # Load model.
 model_id = "google/gemma-3-4b-it"
-model = Gemma3ForConditionalGeneration.from_pretrained(
-    model_id, device_map="auto", torch_dtype="auto"
-)
+model = Gemma3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 # Oneshot arguments

examples/multimodal_vision/idefics3_example.py

Lines changed: 1 addition & 3 deletions

@@ -9,9 +9,7 @@
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"
-model = Idefics3ForConditionalGeneration.from_pretrained(
-    model_id, device_map="auto", torch_dtype="auto"
-)
+model = Idefics3ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 # Oneshot arguments
