examples/autoround/quantization_w4a16/qwen3_example.py (0 additions, 11 deletions)
@@ -1,5 +1,4 @@
 from auto_round.calib_dataset import get_dataset
-from compressed_tensors.offload import dispatch_model
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
@@ -50,16 +49,6 @@
     shuffle_calibration_samples=False,
 )
 
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to(model.device) for key, value in sample.items()}
-output = model.generate(**sample, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
 # Save to disk compressed.
 SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128-AutoRound"
 print(f"save to {SAVE_DIR}")
examples/awq/qwen3_next_example.py (0 additions, 12 deletions)
@@ -1,4 +1,3 @@
-from compressed_tensors.offload import dispatch_model
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -67,17 +66,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-sym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
examples/big_models_with_sequential_onloading/llama3.3_70b.py (0 additions, 11 deletions)
@@ -1,4 +1,3 @@
-from compressed_tensors.offload import dispatch_model
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -71,16 +70,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 )
 
-# Confirm generations of the quantized model look sane.
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to(model.device) for key, value in sample.items()}
-output = model.generate(**sample, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
 # Save to disk compressed.
 SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W8A8"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
examples/quantization_w4a4_fp4/qwen3_next_example.py (0 additions, 13 deletions)
@@ -1,4 +1,3 @@
-from compressed_tensors.offload import dispatch_model
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -86,18 +85,6 @@ def tokenize(sample):
     moe_calibrate_all_experts=True,
 )
 
-
-print("\n\n")
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================\n\n")
-
-
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
examples/quantization_w4a4_fp4/qwen3_vl_moe_w4a4_fp4.py (0 additions, 9 deletions)
@@ -1,5 +1,4 @@
 import torch
-from compressed_tensors.offload import dispatch_model
 from datasets import load_dataset
 from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
 
@@ -83,14 +82,6 @@ def data_collator(batch):
     data_collator=data_collator,
 )
 
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
-output = model.generate(input_ids, max_new_tokens=20)
-print(processor.decode(output[0]))
-print("==========================================")
-
-
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
 model.save_pretrained(SAVE_DIR)
examples/quantization_w8a8_fp8/qwen3_next_example.py (0 additions, 11 deletions)
@@ -1,4 +1,3 @@
-from compressed_tensors.offload import dispatch_model
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
@@ -31,16 +30,6 @@
 # Apply quantization.
 oneshot(model=model, recipe=recipe)
 
-# Confirm generations of the quantized model look sane.
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
-    model.device
-)
-output = model.generate(input_ids, max_new_tokens=20)
-print(tokenizer.decode(output[0]))
-print("==========================================")
-
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
examples/quantizing_moe/mixtral_example.py (0 additions, 9 deletions)
@@ -1,5 +1,4 @@
 import torch
-from compressed_tensors.offload import dispatch_model
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
@@ -72,14 +71,6 @@ def tokenize(sample):
     trust_remote_code_model=True,
 )
 
-print("========== SAMPLE GENERATION ==============")
-dispatch_model(model)
-sample = tokenizer("Hello my name is", return_tensors="pt")
-sample = {key: value.to(model.device) for key, value in sample.items()}
-output = model.generate(**sample, max_new_tokens=100)
-print(tokenizer.decode(output[0]))
-print("==========================================")
-
 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
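Note: the sample-generation block removed across all seven examples followed the same pattern. For reference, a minimal sketch of that removed pattern, assembled from the deleted lines above; it assumes `model` and `tokenizer` are already loaded and quantized as in the examples:

from compressed_tensors.offload import dispatch_model

# Dispatch the (possibly offloaded) model onto available devices before generating.
dispatch_model(model)

# Tokenize a short prompt and move the tensors to the model's device.
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}

# Generate and decode a short continuation to confirm the quantized model looks sane.
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))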