
Commit 6fdcdb1

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents fab6fe1 + 5375f18

File tree

10 files changed: +59 -31 lines


examples/quantization_w8a8_fp8/gemma2_example.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 # 3) Apply quantization and save in compressed-tensors format.
 oneshot(model=model, recipe=recipe, tokenizer=tokenizer)

-# Save to disk in compressed-tensors format.
+# 4) Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
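
For orientation, the full flow these numbered comments refer to looks roughly as follows; a minimal sketch, assuming the FP8-dynamic recipe used elsewhere in these examples (the model ID and import paths are illustrative and may differ by llmcompressor version):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    from llmcompressor import oneshot
    from llmcompressor.modifiers.quantization import QuantizationModifier

    # 1) Load model and tokenizer (model ID illustrative).
    MODEL_ID = "google/gemma-2-2b-it"
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # 2) Configure the quantization recipe (FP8 dynamic, lm_head excluded).
    recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

    # 3) Apply quantization.
    oneshot(model=model, recipe=recipe, tokenizer=tokenizer)

    # 4) Save to disk in compressed-tensors format.
    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)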

examples/quantization_w8a8_fp8/llama3.2_vision_example.py

Lines changed: 1 addition & 4 deletions
@@ -20,10 +20,7 @@
 )

 # Apply quantization.
-oneshot(
-    model=model,
-    recipe=recipe,
-)
+oneshot(model=model, recipe=recipe)

 # Save to disk in compressed-tensors format.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

examples/quantizing_moe/deepseek_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -59,9 +59,6 @@ def tokenize(sample):
 # list so they remain at full precision
 recipe = "deepseek_recipe_w4a16.yaml"

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -70,7 +67,6 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

 # Confirm generations of the quantized model look sane.
@@ -87,6 +83,11 @@ def tokenize(sample):
     "deepseek models with transformers >= 4.48"
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+

 # Run the model on vLLM
 try:
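
The same refactor repeats in the MoE examples that follow: `oneshot` no longer receives `output_dir` (and, where present, `save_compressed`), and saving becomes an explicit step after calibration. A minimal sketch of the resulting save-and-reload round trip, using only names that appear in these diffs:

    # Explicit save after calibration, replacing oneshot(..., output_dir=SAVE_DIR).
    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16"
    model.save_pretrained(SAVE_DIR, save_compressed=True)  # compressed-tensors format
    tokenizer.save_pretrained(SAVE_DIR)

    # The saved checkpoint reloads like any Hugging Face model.
    model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")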

examples/quantizing_moe/deepseek_moe_w8a8_fp8.py

Lines changed: 5 additions & 4 deletions
@@ -66,19 +66,20 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/deepseek_moe_w8a8_int8.py

Lines changed: 5 additions & 4 deletions
@@ -67,19 +67,20 @@ def tokenize(sample):
     ),
 ]

-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
-
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    save_compressed=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/mixtral_moe_w8a8_fp8.py

Lines changed: 6 additions & 7 deletions
@@ -19,15 +19,11 @@
 MAX_SEQ_LENGTH = 2048
 NUM_CALIBRATION_SAMPLES = 512

-# Save location of quantized model
-SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
-SAVE_COMPRESSED = True
-
+# Recipe
 layers_to_ignore: List[str] = [
     "lm_head",
     "re:.*block_sparse_moe.gate",  # does not quantize well
 ]
-
 recipe = QuantizationModifier(scheme="FP8", targets="Linear", ignore=layers_to_ignore)


@@ -39,10 +35,13 @@
     recipe=recipe,
     max_seq_length=MAX_SEQ_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    save_compressed=SAVE_COMPRESSED,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

examples/quantizing_moe/qwen_moe_w4a16.py

Lines changed: 5 additions & 4 deletions
@@ -59,9 +59,6 @@ def tokenize(sample):
     ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
 )

-SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
-
-
 oneshot(
     model=model,
     dataset=ds,
@@ -70,9 +67,13 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     save_compressed=True,
     trust_remote_code_model=True,
-    output_dir=SAVE_DIR,
 )

+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
 # Load model after saving
 model = AutoModelForCausalLM.from_pretrained(SAVE_DIR, device_map="auto")

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 1 addition & 2 deletions
@@ -3,6 +3,7 @@
 from typing import Optional

 import torch
+from accelerate.hooks import remove_hook_from_module
 from compressed_tensors.utils import force_cpu_offload
 from loguru import logger
 from torch.utils.data import DataLoader
@@ -14,8 +15,6 @@
 from llmcompressor.entrypoints.utils import post_process, pre_process
 from llmcompressor.pipelines import CalibrationPipeline

-from accelerate.hooks import remove_hook_from_module
-
 __all__ = ["Oneshot", "oneshot"]
src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 1 deletion
@@ -3,6 +3,7 @@
 from pathlib import PosixPath
 from typing import Optional, Tuple

+from accelerate.hooks import remove_hook_from_module
 from loguru import logger
 from torch.nn import Module
 from transformers import (
@@ -27,7 +28,6 @@
 )
 from llmcompressor.typing import Processor
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model
-from accelerate.hooks import remove_hook_from_module


 def pre_process(model_args: "ModelArguments"):

src/llmcompressor/utils/module.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+from typing import Callable, Union
+
+import tqdm
+from torch.nn import Module
+
+
+def module_bfs(
+    module: Module,
+    func: Callable[[Module], Module],
+    pre: bool = True,
+    progress: Union[bool, tqdm.tqdm] = False,
+) -> Module:
+    if progress is True:
+        total = len(list(module.modules()))
+        progress = tqdm.tqdm(total=total)
+
+    if pre:
+        module = func(module)
+
+    for name, child in list(module.named_children()):
+        module.add_module(name, module_bfs(child, func, pre, progress))
+
+    if not pre:
+        module = func(module)
+
+    if isinstance(progress, tqdm.tqdm):
+        progress.update(1)
+
+    return module
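
A hedged usage sketch of the new helper: `module_bfs` recursively applies `func` to every module in the tree, re-attaching each (possibly replaced) child via `add_module`; `pre` selects whether a module is visited before or after its children, and `progress=True` shows a tqdm bar. The transform below is illustrative, not part of this commit:

    from torch.nn import Linear, Module, Sequential

    from llmcompressor.utils.module import module_bfs

    def freeze_linear(module: Module) -> Module:
        # Illustrative transform: freeze Linear layers; return others unchanged.
        if isinstance(module, Linear):
            module.requires_grad_(False)
        return module

    model = Sequential(Linear(8, 8), Sequential(Linear(8, 4)))
    model = module_bfs(model, freeze_linear, pre=True, progress=True)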
