Skip to content

Commit d0c6ed8

Browse files
authored
Merge branch 'main' into move-zensicial
2 parents 4ee7ff4 + a2433a9 commit d0c6ed8

File tree

15 files changed

+591
-205
lines changed

15 files changed

+591
-205
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""
2+
This script is adapted to use DDP functionality with AutoRound.
3+
run this with `torchrun --nproc_per_node=2 ddp_qwen3_example.py`
4+
or change nproc_per_node to your desired configuration
5+
6+
Example usage:
7+
torchrun --nproc_per_node=2 ddp_qwen3_example.py \
8+
--model Qwen/Qwen3-8B \
9+
--nsamples 128 \
10+
--iters 100 \
11+
--disable_torch_compile \
12+
--deterministic
13+
"""
14+
15+
import argparse
16+
import os
17+
18+
import torch
19+
import torch.distributed as dist
20+
from compressed_tensors.offload import dispatch_model, init_dist, load_offloaded_model
21+
from loguru import logger
22+
from transformers import AutoModelForCausalLM, AutoTokenizer
23+
24+
from llmcompressor import oneshot
25+
26+
27+
def fix_everything(seed=42):
    """Seed all relevant RNGs (python, numpy, torch CPU and CUDA) for reproducibility.

    :param seed: integer seed applied to every generator (default 42)
    """
    import random

    import numpy as np

    # Apply the same seed to each library's generator in turn.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # no-op when CUDA is unavailable
    ):
        seeder(seed)
36+
37+
38+
def config_deterministic():
    """Force fully deterministic execution for reproducible quantization runs.

    Enables strict deterministic torch algorithms, sets the cuBLAS workspace
    configuration CUDA requires for deterministic matmuls, then seeds all RNGs
    via ``fix_everything()``.
    """
    # warn_only=False: raise on nondeterministic ops instead of just warning.
    torch.use_deterministic_algorithms(True, warn_only=False)
    # Required by cuBLAS for deterministic behavior on CUDA >= 10.2.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    fix_everything()
42+
43+
44+
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        description="AutoRound Quantization with DDP support"
    )
    arg_parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-8B",
        help="Model name or path",
    )
    arg_parser.add_argument(
        "--scheme",
        type=str,
        default="W4A16",
        help="Quantization scheme (W4A16, MXFP8, MXFP4, etc.)",
    )
    arg_parser.add_argument("--iters", type=int, default=200, help="Number of iterations")
    arg_parser.add_argument("--nsamples", type=int, default=128, help="Number of samples")
    arg_parser.add_argument(
        "--disable_torch_compile",
        action="store_true",
        help="Disable torch.compile for model acceleration during quantization",
    )
    arg_parser.add_argument(
        "--deterministic",
        action="store_true",
        help="Enable deterministic mode for reproducibility",
    )
    args = arg_parser.parse_args()

    if args.deterministic:
        config_deterministic()

    model_id = args.model

    ###### DDP MODEL LOAD CHANGE #####
    # Initialize torch.distributed first, then load the checkpoint with its
    # weights offloaded so every rank does not hold a full on-device copy.
    init_dist()
    with load_offloaded_model():
        model = AutoModelForCausalLM.from_pretrained(
            model_id, dtype="auto", device_map="auto_offload"
        )
    ##################################

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Calibration configuration, taken from the CLI where applicable.
    NUM_CALIBRATION_SAMPLES = args.nsamples
    MAX_SEQUENCE_LENGTH = 2048
    ITERS = args.iters

    # Get aligned calibration dataset.
    from auto_round.calib_dataset import get_dataset  # noqa: E402

    # NOTE: the model must be loaded BEFORE importing auto-round related code.
    # This ordering requirement will be lifted once we switch to a new release
    # of auto-round that includes the relevant fix.
    from llmcompressor.modifiers.autoround import AutoRoundModifier  # noqa: E402

    ds = get_dataset(
        tokenizer=tokenizer,
        seqlen=MAX_SEQUENCE_LENGTH,
        nsamples=NUM_CALIBRATION_SAMPLES,
    )

    # Configure the quantization algorithm to run:
    # quantize Linear weights with AutoRound under the requested scheme.
    recipe = AutoRoundModifier(
        targets="Linear",
        scheme=args.scheme,
        # Keep the output head and (for MoE models) router gates in full precision.
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
        ],
        iters=ITERS,
        enable_torch_compile=not args.disable_torch_compile,
    )

    # Apply algorithms.
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        # The calibration set is already aligned across ranks; shuffling here
        # would break that alignment.
        shuffle_calibration_samples=False,
    )

    rank = dist.get_rank()
    logger.info(f"[Rank {rank}] Quantization completed")

    # Confirm generations of the quantized model look sane.
    logger.info("\n\n")
    logger.info("========== SAMPLE GENERATION ==============")
    dispatch_model(model)
    sample = tokenizer("Hello my name is", return_tensors="pt")
    sample = {key: value.to(model.device) for key, value in sample.items()}
    output = model.generate(**sample, max_new_tokens=100)
    logger.info(tokenizer.decode(output[0]))
    logger.info("==========================================\n\n")

    logger.info("Saving...")
    # Save to disk compressed; encode the run configuration in the directory name.
    SAVE_DIR = (
        model_id.rstrip("/").split("/")[-1]
        + f"-{args.scheme}-AutoRound"
        + f"-iters{args.iters}-nsamples{args.nsamples}"
        + "-DDP"
        + str(dist.get_world_size())
    )
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    logger.info(f"Saved to {SAVE_DIR}")

    dist.destroy_process_group()
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modeling.glm_moe_dsa import CalibrationGlmMoeDsaMoE  # noqa: F401
from llmcompressor.modifiers.awq import AWQModifier

# Load the model and its tokenizer.
model_id = "ZhipuAI/GLM-5"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# MoE calibration is handled automatically by the pipeline: the
# `CalibrationGlmMoeDsaMoE` modules (from `llmcompressor.modeling.glm_moe_dsa`)
# are applied during calibration to enable proper expert calibration. They
# permanently unpack the fused 3D expert weights into individual nn.Linear
# layers for quantization target matching and vLLM compatibility.

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start;
# increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load and shuffle the calibration split.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    """Render the chat messages into a single text string (no tokenization yet)."""
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
    )
    return {"text": rendered}


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    """Tokenize rendered text, truncated to the calibration sequence length."""
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Modules to keep in full precision.
moe_ignores = [
    # Layers 0-2: Dense layers - ignore entire layers
    "model.layers.0.*",
    "model.layers.1.*",
    "model.layers.2.*",
    # Ignore the output head
    "lm_head",
]

# Configure the quantization algorithm to run:
# quantize the weights to 4 bit with AWQ with a group size 128.
recipe = AWQModifier(targets="Linear", scheme="W4A16", ignore=moe_ignores)

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Save to disk compressed.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,9 @@ def localversion_func(version: ScmVersion) -> str:
129129
),
130130
("datasets>=4.0.0,<=4.6.0" if BUILD_TYPE == "release" else "datasets>=4.0.0"),
131131
(
132-
# auto-round 0.9.1 cannot work with accelerate <1.10.0
133-
"auto-round>=0.9.6,<=0.10.2"
132+
"auto-round>=0.10.2,<=0.10.2"
134133
if BUILD_TYPE == "release"
135-
else "auto-round>=0.9.6"
134+
else "auto-round>=0.10.2"
136135
),
137136
(
138137
"accelerate>=1.6.0,<=1.12.0"

src/llmcompressor/entrypoints/model_free/helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Mapping, TypeVar
55

66
import torch
7-
from compressed_tensors.utils.match import _match_name
7+
from compressed_tensors.utils.match import match_name
88
from loguru import logger
99
from transformers.file_utils import CONFIG_NAME
1010

@@ -84,7 +84,7 @@ def natural_key(s: str) -> list[str | int]:
8484
for name in names:
8585
# match until we get a full set
8686
for target in targets:
87-
if _match_name(name, target):
87+
if match_name(name, target):
8888
if matches[target] is None:
8989
matches[target] = name
9090
else:

src/llmcompressor/entrypoints/model_free/process.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import torch
77
from compressed_tensors.quantization import QuantizationScheme
8-
from compressed_tensors.utils.match import _match_name
8+
from compressed_tensors.utils.match import match_name
99
from safetensors.torch import load_file, save_file
1010
from torch.nn import Module
1111

@@ -31,7 +31,7 @@ def iter_quantizable_tensors(
3131
for name in list(tensors.keys()):
3232
module_name, param_name = name.rsplit(".", 1)
3333
is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
34-
is_ignored = any(_match_name(module_name, ign) for ign in ignore)
34+
is_ignored = any(match_name(module_name, ign) for ign in ignore)
3535
if not is_linear_weight or is_ignored:
3636
continue
3737

src/llmcompressor/entrypoints/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import os
1111
from pathlib import PosixPath
1212

13-
from compressed_tensors.offload import from_accelerate
13+
from compressed_tensors.offload import from_accelerate, is_distributed
1414
from loguru import logger
1515
from transformers import (
1616
AutoConfig,
@@ -26,6 +26,7 @@
2626
RecipeArguments,
2727
)
2828
from llmcompressor.core import reset_session
29+
from llmcompressor.logger import configure_distributed_logger
2930
from llmcompressor.pytorch.model_load.helpers import parse_dtype
3031
from llmcompressor.transformers.compression.compressed_tensors_utils import (
3132
modify_save_pretrained,
@@ -52,6 +53,9 @@ def pre_process(
5253
Raises:
5354
FileNotFoundError: If the model or processor path is invalid.
5455
"""
56+
# Detect distributed, update logger
57+
if is_distributed():
58+
configure_distributed_logger()
5559

5660
# Initialize model
5761
if isinstance(model_args.model, (str, PosixPath)):

0 commit comments

Comments
 (0)