Skip to content

Commit 36c30ee

Browse files
authored
[Examples] Deprecate dispatch_for_generation in favor of dispatch_model (#2376)
## Purpose ##

* Start using `dispatch_model` as a primitive instead of `dispatch_for_generation`, which doesn't add anything but indirection

## Changes ##

* Find and replace `dispatch_for_generation` -> `dispatch_model`
* Add deprecation warning to `dispatch_for_generation`

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent ef70f43 commit 36c30ee

File tree

79 files changed

+161
-160
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+161
-160
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -107,11 +107,11 @@ Note that the model can be swapped for a local or remote HF-compatible checkpoin
107107
Quantization is applied by selecting an algorithm and calling the `oneshot` API.
108108

109109
```python
110+
from compressed_tensors.offload import dispatch_model
110111
from transformers import AutoModelForCausalLM, AutoTokenizer
111112

112113
from llmcompressor import oneshot
113114
from llmcompressor.modifiers.quantization import QuantizationModifier
114-
from llmcompressor.utils import dispatch_for_generation
115115

116116
MODEL_ID = "Qwen/Qwen3-30B-A3B"
117117

@@ -134,7 +134,7 @@ oneshot(model=model, recipe=recipe)
134134

135135
# Confirm generations of the quantized model look sane.
136136
print("========== SAMPLE GENERATION ==============")
137-
dispatch_for_generation(model)
137+
dispatch_model(model)
138138
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
139139
model.device
140140
)

docs/getting-started/compress.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
2424

2525
from llmcompressor import oneshot
2626
from llmcompressor.modifiers.quantization import QuantizationModifier
27-
from llmcompressor.utils import dispatch_for_generation
27+
from compressed_tensors.offload import dispatch_model
2828

2929
MODEL_ID = "Qwen/Qwen3-30B-A3B"
3030

@@ -47,7 +47,7 @@ oneshot(model=model, recipe=recipe)
4747

4848
# Confirm generations of the quantized model look sane.
4949
print("========== SAMPLE GENERATION ==============")
50-
dispatch_for_generation(model)
50+
dispatch_model(model)
5151
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
5252
model.device
5353
)

docs/key-models/llama4/fp8-example.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,11 @@ Let's walk through the main steps of the quantization process:
1414
Load the model using `AutoModelForCausalLM`:
1515

1616
```python
17+
from compressed_tensors.offload import dispatch_model
1718
from transformers import AutoModelForCausalLM, AutoTokenizer
1819

1920
from llmcompressor import oneshot
2021
from llmcompressor.modifiers.quantization import QuantizationModifier
21-
from llmcompressor.utils import dispatch_for_generation
2222

2323
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
2424

@@ -53,7 +53,7 @@ oneshot(model=model, recipe=recipe)
5353

5454
```python
5555
print("========== SAMPLE GENERATION ==============")
56-
dispatch_for_generation(model)
56+
dispatch_model(model)
5757
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
5858
model.device
5959
)

examples/autoround/quantization_kv_cache/llama3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
5-
from llmcompressor.utils import dispatch_for_generation
66

77
# Select model and load it.
88
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -54,7 +54,7 @@
5454
# Confirm generations of the quantized model look sane.
5555
print("\n\n")
5656
print("========== SAMPLE GENERATION ==============")
57-
dispatch_for_generation(model)
57+
dispatch_model(model)
5858
sample = tokenizer("Hello my name is", return_tensors="pt")
5959
sample = {key: value.to(model.device) for key, value in sample.items()}
6060
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a16/llama3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -41,7 +41,7 @@
4141
# Confirm generations of the quantized model look sane.
4242
print("\n\n")
4343
print("========== SAMPLE GENERATION ==============")
44-
dispatch_for_generation(model)
44+
dispatch_model(model)
4545
sample = tokenizer("Hello my name is", return_tensors="pt")
4646
sample = {key: value.to(model.device) for key, value in sample.items()}
4747
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a16/qwen3_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "Qwen/Qwen3-235B-A22B"
@@ -53,7 +53,7 @@
5353
# Confirm generations of the quantized model look sane.
5454
print("\n\n")
5555
print("========== SAMPLE GENERATION ==============")
56-
dispatch_for_generation(model)
56+
dispatch_model(model)
5757
sample = tokenizer("Hello my name is", return_tensors="pt")
5858
sample = {key: value.to(model.device) for key, value in sample.items()}
5959
output = model.generate(**sample, max_new_tokens=100)

examples/autoround/quantization_w4a4_fp4/llama3.1_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
@@ -41,7 +41,7 @@
4141

4242
print("\n\n")
4343
print("========== SAMPLE GENERATION ==============")
44-
dispatch_for_generation(model)
44+
dispatch_model(model)
4545
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
4646
model.device
4747
)

examples/autoround/quantization_w8a8_fp8/llama4_dynamic_quant_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoProcessor, Llama4ForConditionalGeneration
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -51,7 +51,7 @@
5151
# Confirm generations of the quantized model look sane.
5252
print("\n\n")
5353
print("========== SAMPLE GENERATION ==============")
54-
dispatch_for_generation(model)
54+
dispatch_model(model)
5555
sample = processor(text="Hello my name is", return_tensors="pt")
5656
sample = {key: value.to(model.device) for key, value in sample.items()}
5757
output = model.generate(**sample, max_new_tokens=1)

examples/autoround/quantization_w8a8_fp8/llama4_static_quant_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
from auto_round.calib_dataset import get_dataset
2+
from compressed_tensors.offload import dispatch_model
23
from transformers import AutoProcessor, Llama4ForConditionalGeneration
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.autoround import AutoRoundModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -51,7 +51,7 @@
5151
# Confirm generations of the quantized model look sane.
5252
print("\n\n")
5353
print("========== SAMPLE GENERATION ==============")
54-
dispatch_for_generation(model)
54+
dispatch_model(model)
5555
sample = processor(text="Hello my name is", return_tensors="pt")
5656
sample = {key: value.to(model.device) for key, value in sample.items()}
5757
output = model.generate(**sample, max_new_tokens=1)

examples/awq/fp8_block_llama_example.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
1+
from compressed_tensors.offload import dispatch_model
12
from datasets import load_dataset
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.awq import AWQModifier
6-
from llmcompressor.utils import dispatch_for_generation
77

88
# Select model and load it.
99
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
@@ -67,7 +67,7 @@ def tokenize(sample):
6767
# Confirm generations of the quantized model look sane.
6868
print("\n\n")
6969
print("========== SAMPLE GENERATION ==============")
70-
dispatch_for_generation(model)
70+
dispatch_model(model)
7171
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
7272
model.device
7373
)

0 commit comments

Comments
 (0)