Skip to content

Commit 0bc916e

Browse files
authored
[AWQ][DDP] adding DDP functionality to AWQ (#2457)
This PR enables AWQ to have DDP functionality. Similar to [GPTQ DDP](#2333), I noticed a situation involving compounding floating-point errors. With GPTQ this issue made the non-DDP evaluation performance better; however, this time it made the DDP evaluation performance worse. After correcting the compounding error, DDP and non-DDP evaluation performance are more closely aligned with one another, and both are also slightly better than or equal to (to 2 decimal points) the performance from before. See results below: ``` Script Model Time (min) GPU (GB) Flex Strict Flex(before) Strict(before) --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- examples/awq/llama_example_ddp.py ./Meta-Llama-3-8B-Instruct-awq-asym-DDP4 2.40 4.99 0.7142 0.7149 0.6983 0.6990 ++ examples/awq/llama_example.py ./Meta-Llama-3-8B-Instruct-awq-asym 7.02 10.20 0.7081 0.7074 0.7058 0.7058 ++ examples/awq/llama_example_with_masking_ddp.py ./Meta-Llama-3-8B-Instruct-awq-asym-masked-DDP4 2.67 4.98 0.7119 0.7119 examples/awq/llama_example_with_masking.py ./Meta-Llama-3-8B-Instruct-awq-asym-masked 8.13 10.14 0.7058 0.7074 examples/awq/qwen3_vl_30b_example_ddp.py ./Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-g32-DDP4 143.10 3.38 0.8764 0.8529 0.8696 0.8453 ++ examples/awq/qwen3-vl-30b-a3b-Instruct-example.py ./Qwen3-VL-30B-A3B-Instruct-AWQ-W4A16-mse-seq 446.68 3.93 0.8643 0.8491 0.8613 0.8499 +- examples/awq/qwen3_moe_example_ddp.py ./Qwen3-30B-A3B-awq-sym-DDP4 143.90 3.36 0.8802 0.8832 0.8848 0.8802 -+ examples/awq/qwen3_moe_example.py ./Qwen3-30B-A3B-awq-sym 459.65 4.13 0.8825 0.8863 0.8878 0.8840 -+ ``` ## Changes: - Added distributed functionality - Accumulate activation sums instead of means to avoid floating-point errors - Make everything broadcastable by changing to tensors - Added a helper for all-reducing with the sum op Test Plan: see the penultimate commit for the test scripts and evaluation framework
--------- Signed-off-by: HDCharles <charlesdavidhernandez@gmail.com>
1 parent 2ab0244 commit 0bc916e

File tree

3 files changed

+273
-31
lines changed

3 files changed

+273
-31
lines changed

examples/awq/llama_example_ddp.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
#############################################################################
# This script is adapted from ./llama_example.py and adds DDP functionality.
# run this with `torchrun --nproc_per_node=2 llama_example_ddp.py`
# or change nproc_per_node to your desired configuration
# to adapt other examples to use DDP, see the 2 altered sections below
#############################################################################

import time

import torch
from compressed_tensors.offload import dispatch_model, init_dist, load_offloaded_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.datasets.utils import get_rank_partition
from llmcompressor.modifiers.awq import AWQModifier

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# DDP section 1: initialize the process group and load the model with weights
# offloaded so each rank does not hold a full on-device copy during loading.
init_dist()
with load_offloaded_model():
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype="auto", device_map="auto_offload"
    )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess. get_rank_partition gives each DDP rank its own
# slice of the calibration split so ranks calibrate on disjoint samples.
ds = load_dataset(
    DATASET_ID, split=get_rank_partition(DATASET_SPLIT, NUM_CALIBRATION_SAMPLES)
)
ds = ds.shuffle(seed=42)


def preprocess(example):
    """Render each chat transcript into a single text string via the chat template."""
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    """Tokenize a rendered text sample, truncating to MAX_SEQUENCE_LENGTH."""
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# FIX: `tokenize` was defined but never applied, leaving the dataset as raw
# text. Apply it (dropping the now-redundant text columns), matching the
# pattern used by ./llama_example.py.
ds = ds.map(tokenize, remove_columns=ds.column_names)


# Configure the quantization algorithm to run.
recipe = [
    AWQModifier(
        ignore=["lm_head"], scheme="W4A16_ASYM", targets=["Linear"], duo_scaling="both"
    ),
]

torch.cuda.reset_peak_memory_stats()
start_time = time.time()

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

elapsed_time = time.time() - start_time
peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
print("Quantization Complete")
print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed. The world size is appended to the directory name so
# runs with different DDP configurations do not overwrite one another.
SAVE_DIR = (
    MODEL_ID.rstrip("/").split("/")[-1]
    + "-awq-asym-DDP"
    + str(torch.distributed.get_world_size())
)
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# DDP section 2: tear down the process group before exit.
torch.distributed.destroy_process_group()
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
#############################################################################
# This script is adapted from ./qwen3_moe_example.py and adds DDP functionality.
# run this with `torchrun --nproc_per_node=2 qwen3_moe_example_ddp.py`
# or change nproc_per_node to your desired configuration
# to adapt other examples to use DDP, see the 2 altered sections below
#############################################################################

import time

import torch
from compressed_tensors.offload import dispatch_model, init_dist, load_offloaded_model
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.datasets.utils import get_rank_partition
from llmcompressor.modifiers.awq import AWQModifier

# Select model and load it.
MODEL_ID = "Qwen/Qwen3-30B-A3B"

# DDP section 1: initialize the process group and load the model with weights
# offloaded so each rank does not hold a full on-device copy during loading.
init_dist()
with load_offloaded_model():
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, dtype="auto", device_map="auto_offload"
    )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 256 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512

# Load dataset and preprocess. get_rank_partition gives each DDP rank its own
# slice of the calibration split so ranks calibrate on disjoint samples.
ds = load_dataset(
    DATASET_ID, split=get_rank_partition(DATASET_SPLIT, NUM_CALIBRATION_SAMPLES)
)
ds = ds.shuffle(seed=42)


def preprocess(example):
    """Render each chat transcript into a single text string via the chat template."""
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    """Tokenize a rendered text sample, truncating to MAX_SEQUENCE_LENGTH."""
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# FIX: `tokenize` was defined but never applied, leaving the dataset as raw
# text. Apply it (dropping the now-redundant text columns), matching the
# pattern used by ./qwen3_moe_example.py.
ds = ds.map(tokenize, remove_columns=ds.column_names)


# Configure the quantization algorithm to run.
# NOTE: vllm currently does not support asym MoE, using symmetric here
recipe = [
    AWQModifier(
        ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

torch.cuda.reset_peak_memory_stats()
start_time = time.time()

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

elapsed_time = time.time() - start_time
peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
print("Quantization Complete")
print(f"Time: {elapsed_time / 60:.2f} minutes ({elapsed_time:.2f} seconds)")
print(f"Peak GPU Memory: {peak_memory_gb:.2f} GB")

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_model(model)
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to(
    model.device
)
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed. The world size is appended to the directory name so
# runs with different DDP configurations do not overwrite one another.
SAVE_DIR = (
    MODEL_ID.rstrip("/").split("/")[-1]
    + "-awq-sym-DDP"
    + str(torch.distributed.get_world_size())
)
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# DDP section 2: tear down the process group before exit.
torch.distributed.destroy_process_group()

0 commit comments

Comments
 (0)