Skip to content

Commit 36c30ee

Browse files
authored
[Examples] Deprecate dispatch_for_generation in favor of dispatch_model (#2376)
## Purpose ##

* Start using `dispatch_model` as a primitive instead of `dispatch_for_generation`, which doesn't add anything but indirection

## Changes ##

* Find and replace `dispatch_for_generation` -> `dispatch_model`
* Add deprecation warning to `dispatch_for_generation`

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent ef70f43 commit 36c30ee

File tree

79 files changed

+161
-160
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+161
-160
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -107,11 +107,11 @@ Note that the model can be swapped for a local or remote HF-compatible checkpoin
107107
Quantization is applied by selecting an algorithm and calling the `oneshot` API.
108108

109109
```python
110+
from compressed_tensors.offload import dispatch_model
110111
from transformers import AutoModelForCausalLM, AutoTokenizer
111112

112113
from llmcompressor import oneshot
113114
from llmcompressor.modifiers.quantization import QuantizationModifier
114-
from llmcompressor.utils import dispatch_for_generation
115115

116116
MODEL_ID = "Qwen/Qwen3-30B-A3B"
117117

@@ -134,7 +134,7 @@ oneshot(model=model, recipe=recipe)
134134

135135
# Confirm generations of the quantized model look sane.
136136
print("========== SAMPLE GENERATION ==============")
137-
dispatch_for_generation(model)
137+
dispatch_model(model)
138138
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
139139
model.device
140140
)

docs/getting-started/compress.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
2424

2525
from llmcompressor import oneshot
2626
from llmcompressor.modifiers.quantization import QuantizationModifier
27-
from llmcompressor.utils import dispatch_for_generation
27+
from compressed_tensors.offload import dispatch_model
2828

2929
MODEL_ID = "Qwen/Qwen3-30B-A3B"
3030

@@ -47,7 +47,7 @@ oneshot(model=model, recipe=recipe)
4747

4848
# Confirm generations of the quantized model look sane.
4949
print("========== SAMPLE GENERATION ==============")
50-
dispatch_for_generation(model)
50+
dispatch_model(model)
5151
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
5252
model.device
5353
)

docs/key-models/llama4/fp8-example.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,11 @@ Let's walk through the main steps of the quantization process:
1414
Load the model using `AutoModelForCausalLM`:
1515

1616
```python
17+
from compressed_tensors.offload import dispatch_model
1718
from transformers import AutoModelForCausalLM, AutoTokenizer
1819

1920
from llmcompressor import oneshot
2021
from llmcompressor.modifiers.quantization import QuantizationModifier
21-
from llmcompressor.utils import dispatch_for_generation
2222

2323
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
2424

@@ -53,7 +53,7 @@ oneshot(model=model, recipe=recipe)
5353

5454
```python
5555
print("========== SAMPLE GENERATION ==============")
56-
dispatch_for_generation(model)
56+
dispatch_model(model)
5757
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
5858
model.device
5959
)

examples/autoround/quantization_kv_cache/llama3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
5-
from llmcompressor.utils import dispatch_for_generation
66

77
# Select model and load it.
88
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -54,7 +54,7 @@
5454
# Confirm generations of the quantized model look sane.
5555
print("\n\n")
5656
print("========== SAMPLE GENERATION ==============")
57-
dispatch_for_generation(model)
57+
dispatch_model(model)
5858
sample = tokenizer("Hello my name is", return_tensors="pt")
5959
sample = {key: value.to(model.device) for key, value in sample.items()}
6060
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a16/llama3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -41,7 +41,7 @@
4141
# Confirm generations of the quantized model look sane.
4242
print("\n\n")
4343
print("========== SAMPLE GENERATION ==============")
44-
dispatch_for_generation(model)
44+
dispatch_model(model)
4545
sample = tokenizer("Hello my name is", return_tensors="pt")
4646
sample = {key: value.to(model.device) for key, value in sample.items()}
4747
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a16/qwen3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "Qwen/Qwen3-235B-A22B"
@@ -53,7 +53,7 @@
5353
# Confirm generations of the quantized model look sane.
5454
print("\n\n")
5555
print("========== SAMPLE GENERATION ==============")
56-
dispatch_for_generation(model)
56+
dispatch_model(model)
5757
sample = tokenizer("Hello my name is", return_tensors="pt")
5858
sample = {key: value.to(model.device) for key, value in sample.items()}
5959
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a4_fp4/llama3.1_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
@@ -41,7 +41,7 @@
4141

4242
print("\n\n")
4343
print("========== SAMPLE GENERATION ==============")
44-
dispatch_for_generation(model)
44+
dispatch_model(model)
4545
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
4646
model.device
4747
)

examples/autoround/quantization_w8a8_fp8/llama4_dynamic_quant_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoProcessor, Llama4ForConditionalGeneration
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -51,7 +51,7 @@
5151
# Confirm generations of the quantized model look sane.
5252
print("\n\n")
5353
print("========== SAMPLE GENERATION ==============")
54-
dispatch_for_generation(model)
54+
dispatch_model(model)
5555
sample = processor(text="Hello my name is", return_tensors="pt")
5656
sample = {key: value.to(model.device) for key, value in sample.items()}
5757
output = model.generate(**sample, max_new_tokens=1)

examples/autoround/quantization_w8a8_fp8/llama4_static_quant_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoProcessor, Llama4ForConditionalGeneration
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -51,7 +51,7 @@
5151
# Confirm generations of the quantized model look sane.
5252
print("\n\n")
5353
print("========== SAMPLE GENERATION ==============")
54-
dispatch_for_generation(model)
54+
dispatch_model(model)
5555
sample = processor(text="Hello my name is", return_tensors="pt")
5656
sample = {key: value.to(model.device) for key, value in sample.items()}
5757
output = model.generate(**sample, max_new_tokens=1)

examples/awq/fp8_block_llama_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1+
from compressed_tensors.offload import dispatch_model
12
from datasets import load_dataset
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.awq import AWQModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -67,7 +67,7 @@ def tokenize(sample):
6767
# Confirm generations of the quantized model look sane.
6868
print("\n\n")
6969
print("========== SAMPLE GENERATION ==============")
70-
dispatch_for_generation(model)
70+
dispatch_model(model)
7171
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
7272
model.device
7373
)

0 commit comments

Comments
 (0)