
Commit a3d2a3f

dzhengAP (David Zheng) authored
[Distributed][SmoothQuant] Add distributed activation scale reduction (#2180) (#2471)
## Summary

Implements distributed support for `SmoothQuantModifier` as part of the weight-parallel optimization tracked in #2180, assigned to @dzhengAP.

## What this PR does

In a distributed calibration run, each rank observes a disjoint partition of the calibration dataset. Activation statistics (per-channel min/max) are collected locally via forward hooks. Before smoothing scales are computed, `_reduce_activation_scales()` all-reduces those statistics across all ranks so every rank has the global activation profile; each rank then independently computes identical smoothing scales (a cheap op, so no weight broadcast is needed). This follows the AWQ strategy described in #2180. Single-GPU behavior is completely unchanged — all new code is guarded by `is_distributed()`.

## Changes

- `_reduce_activation_scales()`: all-reduces `min/max_channel_vals` across ranks using async MIN/MAX collectives batched with `wait_for_comms`
- `_apply_smoothing()`: calls `_reduce_activation_scales()` as its first step
- Unit tests: 5 mock-based tests verifying the call contract (no GPU needed)
- DDP example: `examples/quantization_w8a8_int8/smoothquant_ddp_example.py`
- Multi-GPU integration tests verifying weight equivalence vs. single-GPU

## Test results

- Unit tests: all 5 passed (`pytest -m unit`)
- DDP example: ran successfully on 2x V100 32GB; both ranks completed in ~698s, peak GPU mem 1.66 GB per rank
- ruff: all checks passed

## Distributed Speedup Benchmarks

Model: Qwen/Qwen2-7B-Instruct, 512 calibration samples, 4x V100 32GB

| GPUs  | Total Time | Peak GPU Mem | Speedup |
|-------|------------|--------------|---------|
| 1 GPU | 94.1 min   | 8.93 GB      | 1.00x   |
| 2 GPU | 58.7 min   | 7.06 GB      | 1.60x   |
| 4 GPU | 28.7 min   | 7.06 GB      | 3.28x   |

Benchmark script: `examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py`

Closes part of #2180

cc @kylesayrs

---------

Signed-off-by: David Zheng <dzheng@apple.com>
Signed-off-by: David Zheng <dqzheng1996@gmail.com>
Co-authored-by: David Zheng <dzheng@apple.com>
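The key idea in the summary above — that elementwise MIN/MAX all-reduces leave every rank with identical global statistics, so each rank can compute the same smoothing scales locally — can be illustrated without any GPU or process group. This is a toy sketch with made-up numbers, not the PR's actual code:

```python
# Per-channel activation ranges observed by two hypothetical ranks, each
# calibrating on a disjoint slice of the dataset (toy numbers).
rank_mins = [[-1.0, -0.5, -2.0], [-0.2, -3.0, -1.0]]
rank_maxs = [[0.5, 4.0, 1.0], [2.0, 1.5, 0.8]]

# An all-reduce with ReduceOp.MIN / ReduceOp.MAX leaves every rank holding
# the elementwise min/max across all ranks; afterwards each rank can derive
# identical smoothing scales with no further communication.
global_min = [min(vals) for vals in zip(*rank_mins)]
global_max = [max(vals) for vals in zip(*rank_maxs)]

print(global_min)  # [-1.0, -3.0, -2.0]
print(global_max)  # [2.0, 4.0, 1.0]
```

Because the reduction is commutative and deterministic, no broadcast of the final scale tensors is needed to keep ranks in sync.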
1 parent eb49917 commit a3d2a3f

File tree

4 files changed: +628 −0 lines changed

examples/quantization_w8a8_int8/benchmark_smoothquant_ddp.py

Lines changed: 152 additions & 0 deletions
```python
"""
Benchmark: Single-GPU vs Multi-GPU DDP SmoothQuant calibration time.

Usage:
    # 1 GPU
    python benchmark_smoothquant_ddp.py --num_gpus 1

    # 2 GPU
    torchrun --standalone --nproc_per_node=2 benchmark_smoothquant_ddp.py --num_gpus 2

    # 4 GPU
    torchrun --standalone --nproc_per_node=4 benchmark_smoothquant_ddp.py --num_gpus 4
"""

import argparse
import time

import torch
import torch.distributed as dist
from compressed_tensors.offload import init_dist, load_offloaded_model
from datasets import load_dataset
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
MODEL_ID = "Qwen/Qwen2-7B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


def get_rank():
    if dist.is_initialized():
        return dist.get_rank()
    return 0


def get_world_size():
    if dist.is_initialized():
        return dist.get_world_size()
    return 1


def main(num_gpus: int):
    is_distributed = num_gpus > 1

    # ------------------------------------------------------------------
    # Init distributed if needed
    # ------------------------------------------------------------------
    if is_distributed:
        init_dist()
        with load_offloaded_model():
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                dtype="auto",
                device_map="auto_offload",
            )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype="auto",
            device_map="auto",
        )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # ------------------------------------------------------------------
    # Dataset — each rank gets a disjoint slice
    # ------------------------------------------------------------------
    rank = get_rank()
    world_size = get_world_size()

    samples_per_rank = NUM_CALIBRATION_SAMPLES // world_size
    start = samples_per_rank * rank
    split = f"{DATASET_SPLIT}[{start}:{start + samples_per_rank}]"

    ds = load_dataset(DATASET_ID, split=split)
    ds = ds.shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    ds = ds.map(preprocess)
    ds = ds.map(tokenize, remove_columns=ds.column_names)

    # ------------------------------------------------------------------
    # Recipe
    # ------------------------------------------------------------------
    recipe = [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ]

    # ------------------------------------------------------------------
    # Benchmark
    # ------------------------------------------------------------------
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

    elapsed = time.time() - start_time
    peak_mem_gb = torch.cuda.max_memory_allocated() / (1024**3)

    if rank == 0:
        logger.info("=" * 60)
        logger.info(f"BENCHMARK RESULTS — {world_size} GPU(s)")
        logger.info("=" * 60)
        logger.info(f"Model: {MODEL_ID}")
        logger.info(f"Calibration: {NUM_CALIBRATION_SAMPLES} samples total")
        logger.info(f"Samples/rank: {samples_per_rank}")
        logger.info(f"World size: {world_size}")
        logger.info(f"Total time: {elapsed:.1f}s ({elapsed/60:.2f} min)")
        logger.info(f"Peak GPU mem: {peak_mem_gb:.2f} GB (rank 0)")
        logger.info("=" * 60)

    if is_distributed:
        dist.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_gpus", type=int, default=1)
    args = parser.parse_args()
    main(args.num_gpus)
```
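The per-rank slice arithmetic in the benchmark script (`samples_per_rank`, `start`, and the `datasets`-style `split[a:b]` string) can be checked in isolation. This hypothetical helper, `rank_split`, simply factors out those three lines; it is an illustration, not a function from the PR:

```python
def rank_split(split: str, total_samples: int, rank: int, world_size: int) -> str:
    """Return a datasets-style slice string giving this rank a disjoint chunk.

    Mirrors the slicing done in benchmark_smoothquant_ddp.py: samples are
    divided evenly (floor division), so any remainder samples are dropped.
    """
    samples_per_rank = total_samples // world_size
    start = samples_per_rank * rank
    return f"{split}[{start}:{start + samples_per_rank}]"


# With 512 samples over 4 ranks, the slices are disjoint and cover [0, 512).
print([rank_split("train_sft", 512, r, 4) for r in range(4)])
# ['train_sft[0:128]', 'train_sft[128:256]', 'train_sft[256:384]', 'train_sft[384:512]']
```

Note the floor division: if `total_samples` is not divisible by `world_size`, the trailing remainder is silently excluded from calibration.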
examples/quantization_w8a8_int8/smoothquant_ddp_example.py

Lines changed: 142 additions & 0 deletions
```python
"""
Distributed SmoothQuant + GPTQ W8A8 quantization using Data-Parallel Calibration.

Run with:
    torchrun --standalone --nproc_per_node=NUM_GPUS smoothquant_ddp_example.py

Each rank loads a disjoint partition of the calibration dataset.
SmoothQuantModifier all-reduces per-channel activation statistics across ranks
before computing smoothing scales (identical on every rank, no weight broadcast
needed). GPTQModifier then applies distributed W8A8 quantization.

This script intentionally mirrors the structure of
examples/quantization_w4a16/llama3_ddp_example.py so it is easy to diff.
"""

import time

import torch
import torch.distributed as dist
from compressed_tensors.offload import dispatch_model, init_dist, load_offloaded_model
from datasets import load_dataset
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.datasets.utils import get_rank_partition
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
MODEL_ID = "Qwen/Qwen2-7B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# ---------------------------------------------------------------------------
# DDP init + model load
# ---------------------------------------------------------------------------
init_dist()

with load_offloaded_model():
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype="auto",
        device_map="auto_offload",
    )

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# ---------------------------------------------------------------------------
# Dataset: each rank gets a disjoint slice
# ---------------------------------------------------------------------------
ds = load_dataset(
    DATASET_ID,
    split=get_rank_partition(DATASET_SPLIT, NUM_CALIBRATION_SAMPLES),
)
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(preprocess)
ds = ds.map(tokenize, remove_columns=ds.column_names)

# ---------------------------------------------------------------------------
# Recipe: SmoothQuant (distributed-aware) + GPTQ W8A8
# ---------------------------------------------------------------------------
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# ---------------------------------------------------------------------------
# Run oneshot
# ---------------------------------------------------------------------------
torch.cuda.reset_peak_memory_stats()
start_time = time.time()

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

elapsed = time.time() - start_time
peak_mem_gb = torch.cuda.max_memory_allocated() / (1024**3)

rank = dist.get_rank()
logger.info(
    f"[Rank {rank}] Done in {elapsed:.1f}s | Peak GPU mem: {peak_mem_gb:.2f} GB"
)

# ---------------------------------------------------------------------------
# Sample generation (all ranks must participate; rank 0 prints)
# ---------------------------------------------------------------------------
dist.barrier()
dispatch_model(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {k: v.to(model.device) for k, v in sample.items()}
output = model.generate(**sample, max_new_tokens=50)
if rank == 0:
    logger.info("\n========== SAMPLE GENERATION ==========")
    logger.info(tokenizer.decode(output[0]))
    logger.info("========================================\n")

# ---------------------------------------------------------------------------
# Save (rank 0 only)
# ---------------------------------------------------------------------------
if rank == 0:
    SAVE_DIR = (
        MODEL_ID.rstrip("/").split("/")[-1]
        + "-W8A8-SmoothQuant-DDP"
        + str(dist.get_world_size())
    )
    model.save_pretrained(SAVE_DIR, save_compressed=True)
    tokenizer.save_pretrained(SAVE_DIR)
    logger.info(f"[Rank {rank}] Saved to {SAVE_DIR}")

dist.destroy_process_group()
```

src/llmcompressor/modifiers/transform/smoothquant/base.py

Lines changed: 45 additions & 0 deletions
```diff
@@ -2,7 +2,9 @@
 from typing import Callable
 
 import torch
+import torch.distributed as dist
 from compressed_tensors.offload import update_offload_parameter
+from compressed_tensors.offload.dist_utils import is_distributed
 from compressed_tensors.utils import match_modules_set, match_named_modules
 from loguru import logger
 from pydantic import ConfigDict, Field
@@ -15,6 +17,7 @@
     get_layer_mappings_from_architecture,
     handle_mapping_resolution_errors,
 )
+from llmcompressor.utils.dist import wait_for_comms
 from llmcompressor.utils.pytorch.module import get_module_to_name_dict
 
 MINIMUM_SMOOTHING_SCALE = 1e-5
@@ -290,6 +293,40 @@ def hook_fn(module, inp, out):
             layer = mapping.smooth_layer
             self.register_hook(layer, create_hook_fn(name), "forward")
 
+    def _reduce_activation_scales(self):
+        """
+        In a distributed setting, all-reduce the per-channel min/max activation
+        statistics collected by each rank during calibration.
+
+        Each rank observes a disjoint partition of the calibration dataset, so the
+        global channel-wise min/max must be gathered across all ranks before smoothing
+        scales are computed. We use ``dist.all_reduce`` with MIN/MAX ops so that
+        every rank ends up with identical statistics and can independently compute the
+        same smoothing scales without an extra broadcast of the final scale tensors.
+
+        This is a no-op when not running in a distributed context.
+        """
+        if not is_distributed():
+            return
+
+        pending_comms = []
+        for layer_name, scale in self.scales_.items():
+            pending_comms.append(
+                dist.all_reduce(
+                    scale.min_channel_vals,
+                    op=dist.ReduceOp.MIN,
+                    async_op=True,
+                )
+            )
+            pending_comms.append(
+                dist.all_reduce(
+                    scale.max_channel_vals,
+                    op=dist.ReduceOp.MAX,
+                    async_op=True,
+                )
+            )
+        wait_for_comms(pending_comms)
+
     @torch.no_grad()
     def _apply_smoothing(self, model: Module):
         """
@@ -299,8 +336,16 @@ def _apply_smoothing(self, model: Module):
         Y = (Xdiag(scales)^(-1) * diag(scales)W) where W is the to_balance weights and
         X is the to_smooth weights
 
+        In a distributed setting, activation statistics are first all-reduced across
+        ranks so that every rank computes identical smoothing scales. The scale
+        computation itself is duplicated across ranks (cheap), avoiding the need for
+        a broadcast of the final weight tensors.
+
         This modifies the weights of the model in-place.
         """
+        if is_distributed():
+            self._reduce_activation_scales()
+
         for mapping in self.resolved_mappings_:
             if mapping.smooth_name not in self.scales_:
                 continue
```
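The call contract exercised by the PR's mock-based unit tests can be sketched without a GPU or process group: two async handles per layer (one MIN, one MAX), all of which must be awaited in a single batch. Everything below is a hypothetical stand-in; `wait_for_comms` here only mimics what the real helper presumably does (its implementation is not shown in this diff):

```python
class FakeWork:
    """Stand-in for the work handle returned by dist.all_reduce(async_op=True)."""

    def __init__(self):
        self.waited = False

    def wait(self):
        self.waited = True


def wait_for_comms(pending_comms):
    # Sketch of the assumed behavior: block until every queued async
    # collective has completed before smoothing scales are computed.
    for work in pending_comms:
        work.wait()


# 3 layers x 2 statistics (min + max) -> 6 pending collectives, all awaited.
handles = [FakeWork() for _ in range(2 * 3)]
wait_for_comms(handles)
print(all(w.waited for w in handles))  # True
```

Batching the waits after the issue loop lets the backend overlap the per-layer collectives instead of serializing a blocking all-reduce per tensor.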
