Commit ef26dc4

Merge branch 'main' into bdellabe/awq-w4a8
2 parents: 584a432 + 16de22f

242 files changed, +3464 -3960 lines changed


.github/workflows/test-check-transformers.yaml

Lines changed: 6 additions & 6 deletions
@@ -16,6 +16,10 @@ env:
   CADENCE: "commit"
   HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   detect-changes:
     runs-on: ubuntu-latest
@@ -97,14 +101,10 @@ jobs:
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
           pytest -v tests/llmcompressor/transformers/oneshot
-      - name: Running Sparsification Tests
-        if: (success() || failure()) && steps.install.outcome == 'success'
-        run: |
-          pytest tests/llmcompressor/transformers/sparsification -v
-      - name: Running OBCQ Tests
+      - name: Running SparseGPT Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
-          pytest -v tests/llmcompressor/transformers/obcq
+          pytest -v tests/llmcompressor/transformers/sparsegpt
       - name: Running Tracing Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -804,3 +804,8 @@ wandb/
 timings/
 output_finetune/
 env_log.json
+
+# uv artifacts
+uv.lock
+.venv/
+

DEVELOPING.md

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ make style
 make quality
 ```
 
-This will run automatic code styling using `ruff`, `flake8`, `black`, and `isort` to test that the
-repository's code matches its standards.
+This will run automatic code styling using `ruff` to test that the repository's code matches its standards.
 
 **EXAMPLE: test changes locally**

Makefile

Lines changed: 1 addition & 4 deletions
@@ -26,15 +26,12 @@ quality:
 	@echo "Running python quality checks";
 	ruff check $(CHECKDIRS);
 	ruff format --check $(CHECKDIRS);
-	isort --check-only $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;
 
 # style the code according to accepted standards for the repo
 style:
 	@echo "Running python styling";
+	ruff check --fix $(CHECKDIRS);
 	ruff format $(CHECKDIRS);
-	isort $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;
 
 # run tests for the repo
 test:

README.md

Lines changed: 10 additions & 2 deletions
@@ -22,18 +22,26 @@
   <img alt="LLM Compressor Flow" src="https://github.com/user-attachments/assets/adf07594-6487-48ae-af62-d9555046d51b" width="80%" />
 </p>
 
+---
+
+💬 Join us on the [vLLM Community Slack](https://communityinviter.com/apps/vllm-dev/join-vllm-developers-slack) and share your questions, thoughts, or ideas in:
+
+- `#sig-quantization`
+- `#llm-compressor`
+
+---
+
 ## 🚀 What's New!
 
 Big updates have landed in LLM Compressor! To get a more in-depth look, check out the [deep-dive](https://x.com/RedHat_AI/status/1937865425687093554).
 
 Some of the exciting new features include:
 
+* **Quantization with Multiple Modifiers**: Multiple quantization modifiers can now be applied to the same model for mixed-precision quantization, for example applying AWQ W4A16 to a model's `self_attn` layers and GPTQ W8A8 to its `mlp` layers. This is an advanced usage of `llm-compressor` and an active area of research. See the [non-uniform quantization support](examples/quantization_non_uniform) section for more detail and [example usage](examples/quantization_non_uniform/quantization_multiple_modifiers.py).
 * **QuIP and SpinQuant-style Transforms**: The newly added [`QuIPModifier`](examples/transform/quip_example.py) and [`SpinQuantModifier`](examples/transform/spinquant_example.py) allow users to quantize their models after injecting hadamard weights into the computation graph, reducing quantization error and greatly improving accuracy recovery for low bit weight and activation quantization.
 * **DeepSeekV3-style Block Quantization Support**: This allows for more efficient compression of large language models without needing a calibration dataset. Quantize a Qwen3 model to [W8A8](examples/quantization_w8a8_fp8/fp8_block_example.py).
 * **Llama4 Quantization Support**: Quantize a Llama4 model to [W4A16](examples/multimodal_vision/llama4_example.py) or [NVFP4](examples/quantization_w4a4_fp4/llama4_example.py). The checkpoint produced can seamlessly run in vLLM.
 * **FP4 Quantization - now with MoE and non-uniform support:** Quantize weights and activations to FP4 and seamlessly run the compressed model in vLLM. Model weights and activations are quantized following the NVFP4 [configuration](https://github.com/neuralmagic/compressed-tensors/blob/f5dbfc336b9c9c361b9fe7ae085d5cb0673e56eb/src/compressed_tensors/quantization/quant_scheme.py#L104). See examples of [fp4 activation support](examples/quantization_w4a4_fp4/llama3_example.py), [MoE support](examples/quantization_w4a4_fp4/qwen_30b_a3b.py), and [Non-uniform quantization support](examples/quantization_non_uniform) where some layers are selectively quantized to fp8 for better recovery. You can also mix other quantization schemes, such as int8 and int4.
-* **Large Model Support with Sequential Onloading**: As of llm-compressor>=0.6.0, you can now quantize very large language models on a single GPU. Models are broken into disjoint layers which are then onloaded to the GPU one layer at a time. For more information on sequential onloading, see [Big Modeling with Sequential Onloading](examples/big_models_with_sequential_onloading/README.md) as well as the [DeepSeek-R1 Example](examples/quantizing_moe/deepseek_r1_example.py).
-* **Axolotl Sparse Finetuning Integration:** Seamlessly finetune sparse LLMs with our Axolotl integration. Learn how to create [fast sparse open-source models with Axolotl and LLM Compressor](https://developers.redhat.com/articles/2025/06/17/axolotl-meets-llm-compressor-fast-sparse-open). See also the [Axolotl integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#llmcompressor).
 
 ### Supported Formats
 * Activation Quantization: W8A8 (int8 and fp8)

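The multiple-modifier bullet added above is the headline change in this README hunk. As a rough illustration only, a mixed-precision recipe along those lines might look like the sketch below; the modifier arguments, regex targets, and the `oneshot` call are assumptions patterned on llm-compressor's public AWQ/GPTQ examples rather than code from this commit, and the linked `examples/quantization_non_uniform/quantization_multiple_modifiers.py` remains the authoritative reference.

```python
# Hedged sketch of mixed-precision quantization with two modifiers.
# Modifier arguments, regex targets, and the oneshot() call are
# assumptions modeled on llm-compressor's published examples, not code
# taken from this commit.
from datasets import load_dataset
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model id
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

# Calibration data would be prepared as in the AWQ examples touched by
# this commit (chat template applied to "messages"); elided for brevity.
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:256]")

recipe = [
    # AWQ W4A16 scoped to the attention projections
    AWQModifier(
        targets=[r"re:.*self_attn\.(q|k|v|o)_proj$"],
        scheme="W4A16",
        ignore=["lm_head"],
    ),
    # GPTQ W8A8 scoped to the MLP projections
    GPTQModifier(
        targets=[r"re:.*mlp\.(gate|up|down)_proj$"],
        scheme="W8A8",
        ignore=["lm_head"],
    ),
]

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=256,
)
```

The point of interest is simply that two modifiers can coexist in one recipe, each scoped to a different set of layers.
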
docs/developer/developing.md

Lines changed: 1 addition & 2 deletions
@@ -29,8 +29,7 @@ make style
 make quality
 ```
 
-This will run automatic code styling using `ruff`, `flake8`, `black`, and `isort` to test that the
-repository's code matches its standards.
+This will run automatic code styling using `ruff` to test that the repository's code matches its standards.
 
 **EXAMPLE: test changes locally**

docs/guides/saving_a_model.md

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ If you need more control, you can wrap `save_pretrained` manually:
 
 ```python
 from transformers import AutoModelForCausalLM
-from llmcompressor.transformers.sparsification.compressed_tensors_utils import modify_save_pretrained
+from llmcompressor.transformers.compression.compressed_tensors_utils import modify_save_pretrained
 
 # Load model
 model = AutoModelForCausalLM.from_pretrained("your-model")

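To show where that import-path change lands in practice, here is a minimal continuation of the guide's snippet. It assumes the guide's usual flow of wrapping the model and then calling `save_pretrained`; the `save_compressed=True` keyword is not visible in this hunk, so treat it as an assumption.

```python
from transformers import AutoModelForCausalLM

# Updated import path from this commit
from llmcompressor.transformers.compression.compressed_tensors_utils import (
    modify_save_pretrained,
)

# Load model
model = AutoModelForCausalLM.from_pretrained("your-model")

# Wrap save_pretrained so it can emit compressed-tensors checkpoints
modify_save_pretrained(model)

# Assumed keyword: save_compressed=True writes the compressed format
model.save_pretrained("your-model-compressed", save_compressed=True)
```
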
examples/awq/llama_example.py

Lines changed: 3 additions & 3 deletions
@@ -12,8 +12,8 @@
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
-DATASET_ID = "mit-han-lab/pile-val-backup"
-DATASET_SPLIT = "validation"
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
 
 # Select number of samples. 256 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
@@ -28,7 +28,7 @@
 def preprocess(example):
     return {
         "text": tokenizer.apply_chat_template(
-            [{"role": "user", "content": example["text"]}],
+            example["messages"],
             tokenize=False,
         )
     }

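For readers comparing the old and new calibration setup: pile-val rows expose a raw "text" field, while ultrachat rows carry a full chat in "messages", which is why the `preprocess` body changes in step with the dataset id. The surrounding example roughly wires this together as sketched below; the sample slice, shuffle seed, and `map` step are assumptions based on the example's visible comments, not lines shown in this hunk.

```python
# Hedged sketch of the calibration pipeline around this hunk. Only the
# dataset id, split, and preprocess() body come from the diff; the
# 256-sample slice and shuffle seed are assumptions.
from datasets import load_dataset
from transformers import AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder; the example defines its own MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 256

ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    # ultrachat rows already hold a whole conversation in "messages", so
    # the chat template is applied to the full list instead of wrapping a
    # single "text" field in a synthetic user turn.
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)
```
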
examples/awq/qwen3_moe_example.py

Lines changed: 3 additions & 3 deletions
@@ -12,8 +12,8 @@
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 # Select calibration dataset.
-DATASET_ID = "mit-han-lab/pile-val-backup"
-DATASET_SPLIT = "validation"
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
 
 # Select number of samples. 256 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
@@ -28,7 +28,7 @@
 def preprocess(example):
     return {
         "text": tokenizer.apply_chat_template(
-            [{"role": "user", "content": example["text"]}],
+            example["messages"],
             tokenize=False,
         )
     }

examples/multimodal_vision/gemma3_example.py

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,8 @@ def data_collator(batch):
         scheme="W4A16",
         ignore=[
             "lm_head",
-            "re:model\.vision_tower.*",
-            "re:model\.multi_modal_projector.*",
+            r"re:model\.vision_tower.*",
+            r"re:model\.multi_modal_projector.*",
         ],
     ),
 ]

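The only change here is switching the ignore patterns to raw strings. A quick standalone check of why that matters (nothing below is specific to llm-compressor):

```python
# "\." inside a normal string literal is an unrecognized escape sequence
# (a DeprecationWarning, and a SyntaxWarning on newer Python versions);
# a raw string passes the backslash through explicitly. Both spellings
# currently yield the same characters, so this is a warning fix rather
# than a behavior change.
import re

pattern = r"model\.vision_tower.*"
assert pattern == "model\\.vision_tower.*"  # same characters, spelled safely

# The escaped dot keeps the match literal.
assert re.match(pattern, "model.vision_tower.encoder")
assert re.match(pattern, "modelXvision_tower.encoder") is None
```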