Commit 5bfb497

Author: Sara Adkins

Add MoE and Compressed Inference Examples (#160)

* add deepseek example
* renaming
* run_compressed_example
* update moe example
* extra comment

Parent: 756516b

File tree: 4 files changed, +141 −1 lines
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM

"""
This example covers how to load a quantized model in compressed mode. By default,
SparseAutoModelForCausalLM will decompress the whole model on load, resulting in no
memory savings from quantization. By setting the `run_compressed` kwarg to True, the
model will remain compressed in memory on load, saving memory during inference at the
cost of increased runtime.

During inference, each layer is decompressed as needed before its forward pass.
This saves memory, as only a single layer is ever decompressed at a time, but increases
runtime because each layer must be decompressed before running its forward pass.
"""

# any model with the "compressed-tensors" quant_method and "compressed"
# quantization_status in the quantization config is supported
MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"

SAMPLE_INPUT = [
    "I love quantization because",
    "What is the capital of France?",
    "def fibonacci(n):",
]

# set run_compressed=True to enable running in compressed mode
compressed_model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_STUB, torch_dtype="auto", device_map="cuda:0", run_compressed=True
)

# tokenize the sample data
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
    compressed_model.device
)

# run the compressed model and decode the output
output = compressed_model.generate(**inputs, max_length=50)
print("========== SAMPLE GENERATION ==============")
text_output = tokenizer.batch_decode(output)
for sample in text_output:
    print(sample)
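
The comment above notes that any checkpoint with the "compressed-tensors" quant_method and "compressed" quantization_status is supported. Below is a minimal sketch (not part of this commit) of how one might check those fields before enabling `run_compressed`, assuming the checkpoint's `config.json` exposes `quantization_config` as a plain dict:

```python
# Hedged sketch: inspect a checkpoint's quantization config to see whether it is
# eligible for run_compressed=True. Field names follow the comment in the example
# above; the dict access assumes quantization_config is stored as a plain dict.
from transformers import AutoConfig

MODEL_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"

config = AutoConfig.from_pretrained(MODEL_STUB)
quant_config = getattr(config, "quantization_config", None) or {}

is_supported = (
    quant_config.get("quant_method") == "compressed-tensors"
    and quant_config.get("quantization_status") == "compressed"
)
print(f"run_compressed supported for {MODEL_STUB}: {is_supported}")
```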

examples/quantizing_moe_fp8/README.md renamed to examples/quantizing_moe/README.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ This example leverages `llm-compressor` and `compressed-tensors` to create an FP
You can follow the detailed steps below or simply run the example script with:

```bash
-python examples/quantizing_moe_fp8/mixtral_moe.py
+python examples/quantizing_moe/mixtral_moe_fp8.py
```

### Step 1: Select a Model, Dataset, and Recipe
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# adjust based on the number of desired GPUs
# if not enough GPU memory is available, some layers will automatically be offloaded to the CPU
device_map = calculate_offload_device_map(
    MODEL_ID,
    reserve_for_hessians=True,
    num_gpus=2,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it is recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
MAX_SEQUENCE_LENGTH = 2048


# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for W8A8 quantization
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W8A8",
        ignore=["lm_head", "re:.*mlp.gate$"],
        sequential_update=True,
    ),
]

SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
    output_dir=SAVE_DIR,
)


print("========== SAMPLE GENERATION ==============")
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
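
The directory written by `oneshot(..., save_compressed=True)` above can then be loaded back for inference in the same way as the compressed-inference example in this commit. Below is a minimal sketch (not part of this commit), assuming the W8A8 compressed checkpoint supports `run_compressed` like the FP8 example above and that DeepSeek still requires `trust_remote_code=True` on reload:

```python
# Hedged sketch: reload the compressed checkpoint saved by the script above.
# SAVE_DIR mirrors MODEL_ID.split("/")[1] + "-W8A8" from that script.
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM

SAVE_DIR = "DeepSeek-Coder-V2-Lite-Instruct-W8A8"

reloaded_model = SparseAutoModelForCausalLM.from_pretrained(
    SAVE_DIR,
    torch_dtype="auto",
    device_map="cuda:0",
    run_compressed=True,
    trust_remote_code=True,
)
# the tokenizer is loaded from the original model id, since the script above does
# not explicitly save it alongside the compressed weights
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True
)

inputs = tokenizer(["I love quantization because"], return_tensors="pt").to(
    reloaded_model.device
)
output = reloaded_model.generate(**inputs, max_length=50)
print(tokenizer.batch_decode(output))
```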
