Skip to content

Commit d0c6ed8

Browse files
authored
Merge branch 'main' into move-zensicial
2 parents 4ee7ff4 + a2433a9 commit d0c6ed8

File tree

15 files changed

+591
-205
lines changed

15 files changed

+591
-205
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""
2+
This script is adapted to use DDP functionality with AutoRound.
3+
run this with `torchrun --nproc_per_node=2 ddp_qwen3_example.py`
4+
or change nproc_per_node to your desired configuration
5+
6+
Example usage:
7+
torchrun --nproc_per_node=2 ddp_qwen3_example.py \
8+
--model Qwen/Qwen3-8B \
9+
--nsamples 128 \
10+
--iters 100 \
11+
--disable_torch_compile \
12+
--deterministic
13+
"""
14+
15+
import argparse
16+
import os
17+
18+
import torch
19+
import torch.distributed as dist
20+
from compressed_tensors.offload import dispatch_model, init_dist, load_offloaded_model
21+
from loguru import logger
22+
from transformers import AutoModelForCausalLM, AutoTokenizer
23+
24+
from llmcompressor import oneshot
25+
26+
27+
def fix_everything(seed=42):
    """Seed all relevant RNGs (python, numpy, torch CPU and CUDA) for reproducibility.

    :param seed: integer seed applied to every generator (default 42)
    """
    import random

    import numpy as np

    # Apply the same seed to each library's generator in turn.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # no-op when CUDA is unavailable
    ):
        seeder(seed)
36+
37+
38+
def config_deterministic():
    """Force fully deterministic execution for reproducible quantization runs.

    Enables strict deterministic torch algorithms, sets the cuBLAS workspace
    configuration CUDA requires for deterministic matmuls, then seeds all RNGs
    via ``fix_everything()``.
    """
    # warn_only=False: raise on nondeterministic ops instead of just warning.
    torch.use_deterministic_algorithms(True, warn_only=False)
    # Required by cuBLAS for deterministic behavior on CUDA >= 10.2.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    fix_everything()
42+
43+
44+
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        description="AutoRound Quantization with DDP support"
    )
    arg_parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-8B",
        help="Model name or path",
    )
    arg_parser.add_argument(
        "--scheme",
        type=str,
        default="W4A16",
        help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)",
    )
    arg_parser.add_argument("--iters", type=int, default=200, help="Number of iterations")
    arg_parser.add_argument("--nsamples", type=int, default=128, help="Number of samples")
    arg_parser.add_argument(
        "--disable_torch_compile",
        action="store_true",
        help="Disable torch.compile for model acceleration during quantization",
    )
    arg_parser.add_argument(
        "--deterministic",
        action="store_true",
        help="Enable deterministic mode for reproducibility",
    )
    args = arg_parser.parse_args()

    if args.deterministic:
        config_deterministic()

    model_id = args.model

    ###### DDP MODEL LOAD CHANGE #####
    # Initialize torch.distributed first, then load the checkpoint with its
    # weights offloaded so every rank does not hold a full on-device copy.
    init_dist()
    with load_offloaded_model():
        model = AutoModelForCausalLM.from_pretrained(
            model_id, dtype="auto", device_map="auto_offload"
        )
    ##################################

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Calibration configuration, taken from the CLI where applicable.
    NUM_CALIBRATION_SAMPLES = args.nsamples
    MAX_SEQUENCE_LENGTH = 2048
    ITERS = args.iters

    # Get aligned calibration dataset.
    from auto_round.calib_dataset import get_dataset  # noqa: E402

    # NOTE: the model must be loaded BEFORE importing auto-round related code.
    # This ordering requirement will be lifted once we switch to a new release
    # of auto-round that includes the relevant fix.
    from llmcompressor.modifiers.autoround import AutoRoundModifier  # noqa: E402

    ds = get_dataset(
        tokenizer=tokenizer,
        seqlen=MAX_SEQUENCE_LENGTH,
        nsamples=NUM_CALIBRATION_SAMPLES,
    )

    # Configure the quantization algorithm to run:
    # quantize Linear weights with AutoRound under the requested scheme.
    recipe = AutoRoundModifier(
        targets="Linear",
        scheme=args.scheme,
        # Keep the output head and (for MoE models) router gates in full precision.
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
        ],
        iters=ITERS,
        enable_torch_compile=not args.disable_torch_compile,
    )

    # Apply algorithms.
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        # The calibration set is already aligned across ranks; shuffling here
        # would break that alignment.
        shuffle_calibration_samples=False,
    )

    rank = dist.get_rank()
    logger.info(f"[Rank {rank}] Quantization completed")

    # Confirm generations of the quantized model look sane.
    logger.info("\n\n")
    logger.info("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    sample = tokenizer("Hello my name is", return_tensors="pt")
    sample = {key: value.to(model.device) for key, value in sample.items()}
    output = model.generate(**sample, max_new_tokens=100)
    logger.info(tokenizer.decode(output[0]))
    logger.info("==========================================\n\n")

    logger.info("Saving...")
    # Save to disk compressed; encode the run configuration in the directory name.
    SAVE_DIR = (
        model_id.rstrip("/").split("/")[-1]
        + f"-{args.scheme}-AutoRound"
        + f"-iters{args.iters}-nsamples{args.nsamples}"
        + "-DDP"
        + str(dist.get_world_size())
    )
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    logger.info(f"Saved to {SAVE_DIR}")

    dist.destroy_process_group()
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling.glm_moe_dsa import CalibrationGlmMoeDsaMoE  # noqa: F401
from llmcompressor.modifiers.awq import AWQModifier

# Load the model and its tokenizer.
model_id = "ZhipuAI/GLM-5"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# MoE calibration is handled automatically by the pipeline: the
# `CalibrationGlmMoeDsaMoE` modules (from `llmcompressor.modeling.glm_moe_dsa`)
# are applied during calibration to enable proper expert calibration. They
# permanently unpack the fused 3D expert weights into individual nn.Linear
# layers for quantization target matching and vLLM compatibility.

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start;
# increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load and shuffle the calibration split.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    """Render the chat messages into a single text string (no tokenization yet)."""
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
    )
    return {"text": rendered}


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    """Tokenize rendered text, truncated to the calibration sequence length."""
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Modules to keep in full precision.
moe_ignores = [
    # Layers 0-2: Dense layers - ignore entire layers
    "model.layers.0.*",
    "model.layers.1.*",
    "model.layers.2.*",
    # Ignore the output head
    "lm_head",
]

# Configure the quantization algorithm to run:
# quantize the weights to 4 bit with AWQ with a group size 128.
recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=moe_ignores)

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,9 @@ def localversion_func(version: ScmVersion) -> str:
129129
),
130130
("datasets>=4.0.0,<=4.6.0" if BUILD_TYPE == "release" else "datasets>=4.0.0"),
131131
(
132-
# auto-round 0.9.1 cannot work with accelerate <1.10.0
133-
"auto-round>=0.9.6,<=0.10.2"
132+
"auto-round>=0.10.2,<=0.10.2"
134133
if BUILD_TYPE == "release"
135-
else "auto-round>=0.9.6"
134+
else "auto-round>=0.10.2"
136135
),
137136
(
138137
"accelerate>=1.6.0,<=1.12.0"

src/llmcompressor/entrypoints/model_free/helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Mapping, TypeVar
55

66
import torch
7-
from compressed_tensors.utils.match import _match_name
7+
from compressed_tensors.utils.match import match_name
88
from loguru import logger
99
from transformers.file_utils import CONFIG_NAME
1010

@@ -84,7 +84,7 @@ def natural_key(s: str) -> list[str | int]:
8484
for name in names:
8585
# match until we get a full set
8686
for target in targets:
87-
if _match_name(name, target):
87+
if match_name(name, target):
8888
if matches[target] is None:
8989
matches[target] = name
9090
else:

src/llmcompressor/entrypoints/model_free/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import torch
77
from compressed_tensors.quantization import QuantizationScheme
8-
from compressed_tensors.utils.match import _match_name
8+
from compressed_tensors.utils.match import match_name
99
from safetensors.torch import load_file, save_file
1010
from torch.nn import Module
1111

@@ -31,7 +31,7 @@ def iter_quantizable_tensors(
3131
for name in list(tensors.keys()):
3232
module_name, param_name = name.rsplit(".", 1)
3333
is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
34-
is_ignored = any(_match_name(module_name, ign) for ign in ignore)
34+
is_ignored = any(match_name(module_name, ign) for ign in ignore)
3535
if not is_linear_weight or is_ignored:
3636
continue
3737

src/llmcompressor/entrypoints/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import os
1111
from pathlib import PosixPath
1212

13-
from compressed_tensors.offload import from_accelerate
13+
from compressed_tensors.offload import from_accelerate, is_distributed
1414
from loguru import logger
1515
from transformers import (
1616
AutoConfig,
@@ -26,6 +26,7 @@
2626
RecipeArguments,
2727
)
2828
from llmcompressor.core import reset_session
29+
from llmcompressor.logger import configure_distributed_logger
2930
from llmcompressor.pytorch.model_load.helpers import parse_dtype
3031
from llmcompressor.transformers.compression.compressed_tensors_utils import (
3132
modify_save_pretrained,
@@ -52,6 +53,9 @@ def pre_process(
5253
Raises:
5354
FileNotFoundError: If the model or processor path is invalid.
5455
"""
56+
# Detect distributed, update logger
57+
if is_distributed():
58+
configure_distributed_logger()
5559

5660
# Initialize model
5761
if isinstance(model_args.model, (str, PosixPath)):

0 commit comments

Comments
 (0)