1- """
2- # AutoRound: https://github.com/intel/auto-round/tree/main
3- CUDA_VISIBLE_DEVICES=0,1 python ddp_qwen3_example.py \
4- --model Qwen/Qwen3-8B \
5- --ddp \
6- --nsamples 128 \
7- --iters 200 \
8- --disable_torch_compile \
9-    --deterministic
10-
11- """
12- from loguru import logger
13- from auto_round .calib_dataset import get_dataset
14- from transformers import AutoModelForCausalLM , AutoTokenizer
15-
16- from llmcompressor import oneshot
17- from llmcompressor .modifiers .autoround import AutoRoundModifier
18- from llmcompressor .utils import dispatch_for_generation
19-
20- # Select model and load it.
21- model_id = "Qwen/Qwen3-8B"
1+ #############################################################################
2+ # This script is adapted to use DDP functionality with AutoRound.
3+ # run this with `torchrun --nproc_per_node=2 ddp_qwen3_example.py`
4+ # or change nproc_per_node to the number of GPUs you want to use
5+ #
6+ # Example usage:
7+ # torchrun --nproc_per_node=2 ddp_qwen3_example.py \
8+ # --model Qwen/Qwen3-8B \
9+ # --nsamples 128 \
10+ # --iters 200 \
11+ # --disable_torch_compile \
12+ # --deterministic
13+ #############################################################################
2214
2315import argparse
2416import os
17+ import time
2518
2619import torch
2720import torch .distributed as dist
28- import torch .multiprocessing as mp
21+ from compressed_tensors .offload import dispatch_model , init_dist , load_offloaded_model
22+ from datasets import load_dataset
23+ from loguru import logger
24+ from transformers import AutoModelForCausalLM , AutoTokenizer
25+ import torch .distributed as dist
26+ from llmcompressor import oneshot
27+ from llmcompressor .datasets .utils import get_rank_partition
28+ from llmcompressor .modifiers .autoround import AutoRoundModifier
2929
3030
31- def fix_everything (seed = 42 ):
31+ def fix_everything (seed = 42 ):
3232 import random
3333 import numpy as np
3434
@@ -42,144 +42,16 @@ def config_deterministic():
4242 torch .use_deterministic_algorithms (True , warn_only = False )
4343 os .environ ["CUBLAS_WORKSPACE_CONFIG" ] = ":4096:8"
4444 fix_everything ()
45-
46-
47-
48- def setup_ddp (rank , world_size ):
49- """Initialize the distributed environment."""
50- os .environ ["MASTER_ADDR" ] = os .environ .get ("MASTER_ADDR" , "localhost" )
51- os .environ ["MASTER_PORT" ] = os .environ .get ("MASTER_PORT" , "12356" )
52-
53- # Initialize process group
54- backend = "nccl" if torch .cuda .is_available () else "gloo"
55- dist .init_process_group (backend , rank = rank , world_size = world_size )
56- torch .cuda .set_device (rank )
57-
58-
59- def cleanup_ddp ():
60- """Clean up the distributed environment."""
61- if dist .is_initialized ():
62- dist .destroy_process_group ()
63-
64-
65- def quantize_model (rank , world_size , args ):
66- """
67- Quantize model on a specific GPU rank.
68-
69- Args:
70- rank: GPU rank for this process
71- world_size: Total number of GPUs
72- args: Command line arguments
73- """
74- if args .deterministic :
75- config_deterministic ()
76- logger .info (f"[Rank { rank } /{ world_size } ] Starting quantization" )
77-
78- # Setup DDP if using multiple GPUs
79- if world_size > 1 :
80- setup_ddp (rank , world_size )
81-
82- # Set device for this process
83- model_name = args .model_name
84- # device_map = "meta"
85- model = AutoModelForCausalLM .from_pretrained (
86- model_name , torch_dtype = "auto" ,
87- # device_map=device_map
88- )
89- tokenizer = AutoTokenizer .from_pretrained (model_name )
90-
91- # Select calibration dataset.
92- NUM_CALIBRATION_SAMPLES = args .nsamples
93- MAX_SEQUENCE_LENGTH = 2048
94- ITERS = args .iters
95- # Get aligned calibration dataset.
96-
97- ds = get_dataset (
98- tokenizer = tokenizer ,
99- seqlen = MAX_SEQUENCE_LENGTH ,
100- nsamples = NUM_CALIBRATION_SAMPLES ,
101- )
102-
103- # Configure the quantization algorithm to run.
104- # * quantize the weights to 4 bit with AutoRound with a group size 128
105- recipe = AutoRoundModifier (
106- targets = "Linear" ,
107- scheme = args .scheme ,
108- ignore = [
109- "lm_head" ,
110- "re:.*mlp.gate$" ,
111- ],
112- iters = ITERS ,
113- enable_torch_compile = not args .disable_torch_compile ,
114- )
115-
116- # Apply algorithms.
117- oneshot (
118- model = model ,
119- dataset = ds ,
120- recipe = recipe ,
121- max_seq_length = MAX_SEQUENCE_LENGTH ,
122- num_calibration_samples = NUM_CALIBRATION_SAMPLES ,
123- shuffle_calibration_samples = False ,
124- )
125-
126- # Synchronize all processes
127- if world_size > 1 :
128- dist .barrier ()
129-
130- logger .info (f"[Rank { rank } ] Quantization completed" )
131- if rank == 0 :
132- # Confirm generations of the quantized model look sane.
133- logger .info ("\n \n " )
134- logger .info ("========== SAMPLE GENERATION ==============" )
135- dispatch_for_generation (model )
136- sample = tokenizer ("Hello my name is" , return_tensors = "pt" )
137- sample = {key : value .to (model .device ) for key , value in sample .items ()}
138- output = model .generate (** sample , max_new_tokens = 100 )
139- logger .info (tokenizer .decode (output [0 ]))
140- logger .info ("==========================================\n \n " )
141-
142- # Save to disk compressed.
143- SAVE_DIR = (
144- model_name .rstrip ("/" ).split ("/" )[- 1 ]
145- + f"-{ args .scheme } -AutoRound"
146- + f"-iters{ args .iters } -nsamples{ args .nsamples } "
147- )
148- logger .info (f"save to { SAVE_DIR } " )
149- model .save_pretrained (SAVE_DIR , save_compressed = True )
150- tokenizer .save_pretrained (SAVE_DIR )
151- else :
152- # Other ranks just run quantization without saving
153- logger .info (f"[Rank { rank } ] Running quantization (not saving)" )
154-
155- if world_size > 1 :
156- cleanup_ddp ()
157-
158-
159- def main_spawn (args ):
160- """Main function using mp.spawn for multi-GPU quantization."""
161- world_size = torch .cuda .device_count () if torch .cuda .is_available () else 1
162-
163- logger .info (f"Starting DDP quantization with { world_size } GPUs" )
164-
165- mp .spawn (
166- quantize_model ,
167- args = (world_size , args ),
168- nprocs = world_size ,
169- join = True ,
170- )
171-
172- logger .info ("Quantization completed!" )
17345
17446
17547if __name__ == "__main__" :
17648 parser = argparse .ArgumentParser (
17749 description = "AutoRound Quantization with DDP support"
17850 )
17951 parser .add_argument (
180- "--model_name " ,
52+ "--model " ,
18153 type = str ,
182- default = model_id ,
54+ default = "Qwen/Qwen3-8B" ,
18355 help = "Model name or path" ,
18456 )
18557 parser .add_argument (
@@ -188,9 +60,8 @@ def main_spawn(args):
18860 default = "W4A16" ,
18961 help = "Quantization scheme (W4A16, MXFP8, MXFP4, etc.)" ,
19062 )
191- parser .add_argument ("--iters" , type = int , default = 100 , help = "Number of iterations" )
192- parser .add_argument ("--nsamples" , type = int , default = 256 , help = "Number of samples" )
193- parser .add_argument ("--ddp" , action = "store_true" , help = "Enable DDP multi-GPU mode" )
63+ parser .add_argument ("--iters" , type = int , default = 200 , help = "Number of iterations" )
64+ parser .add_argument ("--nsamples" , type = int , default = 128 , help = "Number of samples" )
19465 parser .add_argument (
19566 "--disable_torch_compile" ,
19667 action = "store_true" ,
@@ -203,22 +74,79 @@ def main_spawn(args):
20374 )
20475 args = parser .parse_args ()
20576
206- # For backward compatibility with existing hardcoded values
207- model_name = args .model_name
208-
209- # Parse scheme from string if needed
210- from auto_round import schemes as ar_schemes
211-
212- scheme_map = {
213- "FP8_STATIC" : ar_schemes .FP8_STATIC ,
214- "MXFP8" : ar_schemes .MXFP8 ,
215- "MXFP4" : ar_schemes .MXFP4 ,
216- }
217- # scheme = scheme_map.get(args.scheme, args.scheme)
218-
219- if args .ddp :
220- logger .info ("Using mp.spawn mode for multi-GPU quantization" )
221- main_spawn (args )
222- else :
223- logger .info ("Using single-process quantization" )
224- quantize_model (rank = 0 , world_size = 1 , args = args )
77+ if args .deterministic :
78+ config_deterministic ()
79+
80+ model_id = args .model
81+
82+ ###### DDP MODEL LOAD CHANGE ######
83+ init_dist ()
84+ with load_offloaded_model ():
85+ model = AutoModelForCausalLM .from_pretrained (
86+ model_id , dtype = "auto" , device_map = "auto_offload"
87+ )
88+ ##################################
89+
90+ tokenizer = AutoTokenizer .from_pretrained (model_name )
91+
92+ # Select calibration dataset.
93+ NUM_CALIBRATION_SAMPLES = args .nsamples
94+ MAX_SEQUENCE_LENGTH = 2048
95+ ITERS = args .iters
96+ # Get aligned calibration dataset.
97+
98+ ds = get_dataset (
99+ tokenizer = tokenizer ,
100+ seqlen = MAX_SEQUENCE_LENGTH ,
101+ nsamples = NUM_CALIBRATION_SAMPLES ,
102+ )
103+
104+ # Configure the quantization algorithm to run.
105+ # * quantize the weights to 4 bits using AutoRound with a group size of 128
106+ recipe = AutoRoundModifier (
107+ targets = "Linear" ,
108+ scheme = args .scheme ,
109+ ignore = [
110+ "lm_head" ,
111+ "re:.*mlp.gate$" ,
112+ ],
113+ iters = ITERS ,
114+ enable_torch_compile = not args .disable_torch_compile ,
115+ )
116+
117+ # Apply algorithms.
118+ oneshot (
119+ model = model ,
120+ dataset = ds ,
121+ recipe = recipe ,
122+ max_seq_length = MAX_SEQUENCE_LENGTH ,
123+ num_calibration_samples = NUM_CALIBRATION_SAMPLES ,
124+ shuffle_calibration_samples = False ,
125+ )
126+
127+ rank = dist .get_rank ()
128+ logger .info (f"[Rank { rank } ] Quantization completed" )
129+ # Confirm generations of the quantized model look sane.
130+ logger .info ("\n \n " )
131+ logger .info ("========== SAMPLE GENERATION ==============" )
132+ dispatch_model (model )
133+ sample = tokenizer ("Hello my name is" , return_tensors = "pt" )
134+ sample = {key : value .to (model .device ) for key , value in sample .items ()}
135+ output = model .generate (** sample , max_new_tokens = 100 )
136+ logger .info (tokenizer .decode (output [0 ]))
137+ logger .info ("==========================================\n \n " )
138+
139+ logger .info ("Saving..." )
140+ # Save to disk compressed.
141+ SAVE_DIR = (
142+ model_id .rstrip ("/" ).split ("/" )[- 1 ]
143+ + f"-{ args .scheme } -AutoRound"
144+ + f"-iters{ args .iters } -nsamples{ args .nsamples } "
145+ + "-DDP"
146+ + str (dist .get_world_size ())
147+ )
148+ model .save_pretrained (SAVE_DIR , save_compressed = True )
149+ tokenizer .save_pretrained (SAVE_DIR )
150+ logger .info (f"Saved to { SAVE_DIR } " )
151+
152+ dist .destroy_process_group ()
0 commit comments