vllm-project
diff --git a/‎.github/workflows/set-comment.yaml‎
Lines changed: 2 additions & 3 deletions b/‎.github/workflows/set-comment.yaml‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎.github/workflows/test-check-transformers.yaml‎
Lines changed: 9 additions & 2 deletions b/‎.github/workflows/test-check-transformers.yaml‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎.github/workflows/test-check.yaml‎
Lines changed: 28 additions & 7 deletions b/‎.github/workflows/test-check.yaml‎
Lines changed: 28 additions & 7 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 1 deletion b/‎README.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/TODO.txt‎
Lines changed: 0 additions & 1 deletion b/‎docs/TODO.txt‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎docs/images/architecture.png‎
-118 KB b/‎docs/images/architecture.png‎
-118 KB
diff --git a/‎docs/schemes.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/schemes.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/awq/awq_one_shot.py‎
Lines changed: 106 additions & 0 deletions b/‎examples/awq/awq_one_shot.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 2 additions & 2 deletions b/‎setup.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/llmcompressor/modifiers/awq/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎src/llmcompressor/modifiers/awq/__init__.py‎
Lines changed: 4 additions & 0 deletions
@@ -1,8 +1,7 @@
 name: PR Reminder Comment Bot
 on:
-  pull_request:
-    branches:
-      - main
+  pull_request_target:
+    branches: [main]
     types: [opened]
 
 jobs:
 
@@ -51,17 +51,24 @@ jobs:
         with:
           python-version: '3.9'
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install dependencies"
         run: pip3 install -U pip setuptools && pip3 install .[dev]
       - uses: actions/checkout@v4
         with:
           repository: "neuralmagic/compressed-tensors"
           path: "compressed-tensors"
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install compressed-tensors dependencies"
         id: install
         run: |
-          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
-          pip3 install ./compressed-tensors/
+          pip3 uninstall -y compressed-tensors
+          export GIT_CEILING_DIRECTORIES="$(pwd)"
+          cd compressed-tensors
+          BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
       - name: "🔬 Running transformers tests"
 
@@ -4,7 +4,7 @@ on:
 
 env:
   CADENCE: "commit"
-  
+
 jobs:
 
   base-tests:
@@ -14,16 +14,23 @@ jobs:
         with:
           python-version: '3.12'
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install dependencies"
         run: pip3 install -U pip setuptools && pip3 install .[dev]
       - uses: actions/checkout@v4
         with:
           repository: "neuralmagic/compressed-tensors"
           path: "compressed-tensors"
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install compressed-tensors dependencies"
         run: |
-          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
-          pip3 install ./compressed-tensors/
+          pip3 uninstall -y compressed-tensors
+          export GIT_CEILING_DIRECTORIES="$(pwd)"
+          cd compressed-tensors
+          BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
       - name: "🔬 Running base tests"
@@ -36,16 +43,23 @@ jobs:
         with:
           python-version: '3.11'
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install dependencies"
         run: pip3 install -U pip setuptools && pip3 install .[dev]
       - uses: actions/checkout@v4
         with:
           repository: "neuralmagic/compressed-tensors"
           path: "compressed-tensors"
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install compressed-tensors dependencies"
         run: |
-          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
-          pip3 install ./compressed-tensors/
+          pip3 uninstall -y compressed-tensors
+          export GIT_CEILING_DIRECTORIES="$(pwd)"
+          cd compressed-tensors
+          BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
       - name: "🔬 Running pytorch tests"
@@ -59,16 +73,23 @@ jobs:
         with:
           python-version: '3.10'
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install dependencies"
         run: pip3 install -U pip setuptools && pip3 install .[dev]
       - uses: actions/checkout@v4
         with:
           repository: "neuralmagic/compressed-tensors"
           path: "compressed-tensors"
+          fetch-depth: 0
+          fetch-tags: true
       - name: "⚙️ Install compressed-tensors dependencies"
         run: |
-          pip3 uninstall -y compressed-tensors compressed-tensors-nightly
-          pip3 install ./compressed-tensors/
+          pip3 uninstall -y compressed-tensors
+          export GIT_CEILING_DIRECTORIES="$(pwd)"
+          cd compressed-tensors
+          BUILD_TYPE=nightly pip3 install .
       - name: "Clean compressed-tensors directory"
         run: rm -r compressed-tensors/
       - name: "🔬 Running pytorch tests"
 
@@ -20,6 +20,7 @@
 ### Supported Algorithms
 * Simple PTQ
 * GPTQ
+* AWQ
 * SmoothQuant
 * SparseGPT
 
@@ -41,7 +42,8 @@ pip install llmcompressor
 Applying quantization with `llmcompressor`:
 * [Activation quantization to `int8`](examples/quantization_w8a8_int8/README.md)
 * [Activation quantization to `fp8`](examples/quantization_w8a8_fp8/README.md)
-* [Weight only quantization to `int4`](examples/quantization_w4a16/README.md)
+* [Weight-only quantization to `int4` using GPTQ](examples/quantization_w4a16/README.md)
+* [Weight-only quantization to `int4` using AWQ](examples/awq/awq_one_shot.py)
 * [Quantizing MoE LLMs](examples/quantizing_moe/README.md)
 * [Quantizing Vision-Language Models](examples/multimodal_vision/README.md)
 * [Quantizing Audio-Language Models](examples/multimodal_audio/README.md)
 
@@ -5,6 +5,7 @@ PTQ is performed to reduce the precision of quantizable weights (e.g., linear la
 
 ### [W4A16](../examples/quantization_w4a16/README.md)
 - Uses GPTQ to compress weights to 4 bits. Requires calibration dataset.
+- Optionally, [AWQ can also be leveraged for W4A16 quantization](../examples/awq/awq_one_shot.py)
 - Useful speed ups in low QPS regimes with more weight compression. 
 - Recommended for any GPUs types.
 
 
@@ -0,0 +1,106 @@
+import lm_eval
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationScheme,
+    QuantizationStrategy,
+    QuantizationType,
+)
+from lm_eval.utils import make_table
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# This example demonstrates how to:
+# 1) Run the `llm-compressor` implementation of AWQ
+# 2) Evaluate the compressed model with the lm_eval framework
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # loaded via from_pretrained below
+DATASET_ID = "mit-han-lab/pile-val-backup"  # calibration data source for load_dataset
+DATASET_SPLIT = "validation"  # split that the calibration rows are sliced from
+NUM_CALIBRATION_SAMPLES = 256  # rows kept for calibration and passed to oneshot
+MAX_SEQUENCE_LENGTH = 512  # token cap per row; also oneshot's max_seq_length
+OUTPUT_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"  # last path component of MODEL_ID plus a suffix
+
+#
+# 1) Run LLM Compressor AWQ implementation
+#
+
+recipe = [
+    AWQModifier(bits=4, symmetric=False),  # same bit width and asymmetry as the scheme below
+    QuantizationModifier(
+        ignore=["lm_head"],
+        config_groups={
+            "group_0": QuantizationScheme(
+                targets=["Linear"],
+                weights=QuantizationArgs(
+                    num_bits=4,
+                    type=QuantizationType.INT,
+                    dynamic=False,
+                    symmetric=False,
+                    strategy=QuantizationStrategy.GROUP,
+                    group_size=128,
+                ),
+            )
+        },
+    ),
+]
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+
+def get_calib_dataset(tokenizer):  # returns the tokenized calibration dataset
+    from datasets import load_dataset  # function-local import
+
+    ds = load_dataset(
+        DATASET_ID,
+        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*100}]",  # 100x candidate pool
+    )
+
+    def preprocess(example):  # tokenize the stripped text, truncated to the cap
+        return {
+            "input_ids": tokenizer.encode(example["text"].strip())[:MAX_SEQUENCE_LENGTH]
+        }
+
+    ds = (  # shuffle -> tokenize -> keep only rows that reach the cap -> take N
+        ds.shuffle(seed=42)
+        .map(preprocess, remove_columns=ds.column_names)
+        .filter(lambda example: len(example["input_ids"]) >= MAX_SEQUENCE_LENGTH)
+        .select(range(NUM_CALIBRATION_SAMPLES))  # NOTE(review): presumably errors if fewer rows remain — confirm
+    )
+
+    return ds
+
+
+oneshot(
+    model=model,
+    dataset=get_calib_dataset(tokenizer=tokenizer),
+    recipe=recipe,
+    output_dir=OUTPUT_DIR,  # the evaluation step below loads from this path
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+print("Done! model saved to", OUTPUT_DIR)
+
+#
+# 2) Evaluate model on wikitext perplexity
+#
+
+results = lm_eval.simple_evaluate(
+    model="vllm",
+    model_args={
+        "pretrained": OUTPUT_DIR,  # the directory written by oneshot above
+        "add_bos_token": True,
+        "dtype": "bfloat16",
+        "gpu_memory_utilization": 0.5,
+    },
+    tasks=["wikitext"],
+    num_fewshot=5,  # NOTE(review): confirm few-shot is meaningful for a perplexity task
+    batch_size="auto",
+)
+print(make_table(results))
@@ -102,7 +102,7 @@ def localversion_func(version: ScmVersion) -> str:
         "sparsity, optimization, model optimization, model compression, "
     ),
     license="Apache",
-    url="https://github.com/neuralmagic/llm-compressor",
+    url="https://github.com/vllm-project/llm-compressor",
     include_package_data=True,
     package_dir={"": "src"},
     packages=find_packages(
@@ -115,7 +115,7 @@ def localversion_func(version: ScmVersion) -> str:
         "requests>=2.0.0",
         "tqdm>=4.0.0",
         "torch>=1.7.0",
-        "transformers>4.0,<4.50",
+        "transformers>4.0,<5.0",
         "datasets",
         "accelerate>=0.20.3,!=1.1.0",
         "pynvml",
 
@@ -0,0 +1,4 @@
+# flake8: noqa
+
+from .base import *
+from .mappings import *