
Commit 2d87993

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents 96476fe + 8b2c612


48 files changed: +273, -86 lines

Makefile

Lines changed: 2 additions & 2 deletions
@@ -26,14 +26,14 @@ quality:
 	@echo "Running python quality checks";
 	ruff check $(CHECKDIRS);
 	isort --check-only $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
+	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;

 # style the code according to accepted standards for the repo
 style:
 	@echo "Running python styling";
 	ruff format $(CHECKDIRS);
 	isort $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
+	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;

 # run tests for the repo
 test:
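For context: flake8's W605 flags invalid escape sequences in ordinary (non-raw) string literals. The `re:`-prefixed ignore patterns introduced elsewhere in this commit (see the gemma3_example.py diff below) write `\.` inside plain strings, which is exactly what W605 would flag. A minimal sketch of the two spellings (illustrative, not part of the commit):

import re

pattern_plain = "model\.vision_tower.*"  # flake8 W605: "\." is not a valid escape
pattern_raw = r"model\.vision_tower.*"   # raw string: same regex, no warning

# Python leaves unrecognized escapes like "\." in place, so both strings
# compile to the same pattern; only the linter treats them differently.
assert re.match(pattern_plain, "model.vision_tower.encoder")
assert re.match(pattern_raw, "model.vision_tower.encoder")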

examples/awq/llama_example.py

Lines changed: 1 addition & 1 deletion
@@ -70,6 +70,6 @@ def tokenize(sample):
 print("==========================================\n\n")

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
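The `rstrip("/")` guard matters when MODEL_ID is a local path with a trailing slash: `split("/")[-1]` on such a string returns an empty basename, so the model would be saved to a directory named only after the suffix. A quick illustration with a hypothetical local path:

MODEL_ID = "/models/Meta-Llama-3-8B-Instruct/"  # hypothetical local checkout

print(MODEL_ID.split("/")[-1] + "-awq-asym")              # "-awq-asym"
print(MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym")  # "Meta-Llama-3-8B-Instruct-awq-asym"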

examples/awq/qwen3_moe_example.py

Lines changed: 1 addition & 1 deletion
@@ -77,6 +77,6 @@ def tokenize(sample):
 print("==========================================\n\n")

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-sym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+OUTPUT_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+
+# Load model
+# Note: device_map="auto" will offload to CPU if not enough space on GPU.
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
+)
+
+# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC).
+recipe = QuantizationModifier(
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
+
+# Apply quantization and save in `compressed-tensors` format.
+oneshot(
+    model=model,
+    recipe=recipe,
+    tokenizer=AutoTokenizer.from_pretrained(MODEL_ID),
+    output_dir=OUTPUT_DIR,
+)
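The resulting directory is a standard `compressed-tensors` checkpoint, so it can be served directly; a minimal sketch with vLLM, assuming vLLM is installed and the script above has been run:

from vllm import LLM, SamplingParams

# OUTPUT_DIR produced by the script above.
llm = LLM(model="Meta-Llama-3-70B-Instruct-FP8-Dynamic")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)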
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+
+# adjust based on the number of desired GPUs
+# reserve_for_hessians=True reserves memory which is required by
+# GPTQModifier and SparseGPTModifier
+device_map = calculate_offload_device_map(
+    MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# define a llmcompressor recipe for W8A8 quantization
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(
+        targets="Linear",
+        scheme="W8A8",
+        ignore=["lm_head"],
+    ),
+]
+
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8"
+
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    save_compressed=True,
+    output_dir=SAVE_DIR,
+)
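When experimenting with `calculate_offload_device_map`, it can help to confirm where modules actually landed before starting calibration. A small sketch, relying on the `hf_device_map` attribute that accelerate attaches to dispatched models:

from collections import Counter

# Tally modules per device: GPU indices, "cpu", or "disk" for offloaded shards.
placement = Counter(model.hf_device_map.values())
for device, count in sorted(placement.items(), key=str):
    print(f"{device}: {count} modules")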
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic"
+
+# 1) Load model (device_map="auto" will shard the model over multiple GPUs!).
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# 2) Prepare calibration dataset (in this case, we use ultrachat).
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 1024
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# 3) Configure algorithms. In this case, we:
+#   * quantize the weights to int8 with GPTQ (static per channel)
+#   * quantize the activations to int8 (dynamic per token)
+recipe = [
+    GPTQModifier(
+        targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1
+    ),
+]
+
+# 4) Apply algorithms and save in `compressed-tensors` format.
+# if you encounter GPU out-of-memory issues, consider using an explicit
+# device map (see multi_gpus_int8_device_map.py)
+oneshot(
+    model=model,
+    tokenizer=tokenizer,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    output_dir=SAVE_DIR,
+)
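As in the repo's other examples, a short generation pass is a cheap sanity check before relying on the quantized checkpoint. A minimal sketch using the model and tokenizer already in scope (prompt is illustrative):

# Confirm the quantized model still produces coherent text.
sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(output[0]))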

examples/multimodal_audio/whisper_example.py

Lines changed: 1 addition & 1 deletion
@@ -103,6 +103,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 6 additions & 2 deletions
@@ -30,7 +30,11 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:*.lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=[
+            "lm_head",
+            "re:model\.vision_tower.*",
+            "re:model\.multi_modal_projector.*",
+        ],
     ),
 ]

@@ -70,6 +74,6 @@ def data_collator(batch):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
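The reworked ignore list anchors the regexes at the real module paths (`model.vision_tower...`, `model.multi_modal_projector...`) and replaces the malformed `re:*.lm_head` pattern with a plain name match. A small sketch of how such patterns resolve against hypothetical Gemma 3 module names (names are illustrative; matching semantics assume `re.match`-style anchoring at the start):

import re

patterns = [r"model\.vision_tower.*", r"model\.multi_modal_projector.*"]
names = [
    "lm_head",
    "model.language_model.layers.0.self_attn.q_proj",
    "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1",
    "model.multi_modal_projector.mm_input_projection",
]

for name in names:
    ignored = name == "lm_head" or any(re.match(p, name) for p in patterns)
    print(f"{name}: {'ignored' if ignored else 'quantized'}")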

examples/multimodal_vision/idefics3_example.py

Lines changed: 1 addition & 1 deletion
@@ -115,6 +115,6 @@ def tokenize(sample):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 1 addition & 1 deletion
@@ -70,6 +70,6 @@ def data_collator(batch):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
