# LLMCompressor Integration

Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).

This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.

It uses Axolotl's plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.

---

## Requirements

- Axolotl with `llmcompressor` extras:

  ```bash
  pip install "axolotl[llmcompressor]"
  ```

- Requires `llmcompressor >= 0.5.1`

This will install all necessary dependencies to fine-tune sparsified models using the integration.
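
To confirm the installed version meets the minimum, a quick check using only the Python standard library (no assumptions about either package's API):

```python
# Print installed versions to verify the llmcompressor >= 0.5.1 requirement.
from importlib.metadata import version

print("axolotl:", version("axolotl"))
print("llmcompressor:", version("llmcompressor"))  # should be 0.5.1 or newer
```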

---

## Usage

To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:

```yaml
plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin

llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
# ... (other training arguments)
```

This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.

Pre-sparsified checkpoints can be:
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor) (see the sketch after this list)
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
- Any custom LLM with compatible sparsity patterns that you've created yourself
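
Below is a minimal sketch of the first option: producing a 2:4 sparse checkpoint with LLMCompressor yourself before fine-tuning it through this plugin. The `oneshot` entrypoint, `SparseGPTModifier`, the model name, and the calibration dataset name are assumptions about the LLMCompressor API and may differ between versions; check the LLMCompressor documentation for the release you have installed:

```python
# Hypothetical one-shot 2:4 sparsification with LLMCompressor (done before Axolotl
# fine-tuning, not by this plugin). Import paths and arguments may vary across versions.
from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier

recipe = SparseGPTModifier(
    sparsity=0.5,           # prune 50% of the targeted weights
    mask_structure="2:4",   # semi-structured 2:4 sparsity pattern
    ignore=["lm_head"],     # leave the output head dense
)

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # example base model (assumption)
    dataset="open_platypus",                      # example calibration dataset (assumption)
    recipe=recipe,
    output_dir="Meta-Llama-3-8B-Instruct-2of4-sparse",
    num_calibration_samples=512,
)
```

The resulting directory can then be used as the `base_model` in your Axolotl config.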

To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)

### Storage Optimization with `save_compressed`

Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
- Reduces disk space usage by approximately 40%
- Maintains compatibility with vLLM for accelerated inference
- Maintains compatibility with llmcompressor for further optimization (e.g. quantization)

This option is highly recommended when working with sparse models to maximize the benefits of model compression.
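
As a quick sanity check after saving, the compressed checkpoint's `config.json` carries the compression metadata that vLLM and llmcompressor read back. The key names below are assumptions (they vary across llmcompressor / compressed-tensors versions), so this sketch simply prints whichever of them are present:

```python
# Inspect a saved checkpoint for compression metadata.
# The key names checked here are assumptions and differ between library versions.
import json
from pathlib import Path

config = json.loads(Path("path/to/your/sparse/model/config.json").read_text())
for key in ("compression_config", "quantization_config", "sparsity_config"):
    if key in config:
        print(f"{key}:")
        print(json.dumps(config[key], indent=2))
```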

### Example Config

See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.

---

## Inference with vLLM

After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
You can also use LLMCompressor to apply additional quantization to your fine-tuned
sparse model before inference for even greater performance benefits:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM("path/to/your/sparse/model")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
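
If you take the optional quantization step mentioned above, a rough sketch using LLMCompressor's one-shot flow is shown below. The `oneshot` entrypoint, `GPTQModifier`, its `scheme="W4A16"` argument, and the calibration dataset name are assumptions about the LLMCompressor API and may differ between versions; consult the LLMCompressor README for the current entrypoints:

```python
# Hypothetical post-fine-tuning W4A16 quantization of the sparse checkpoint with LLMCompressor.
# Import paths and argument names are assumptions and may vary across llmcompressor versions.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

oneshot(
    model="path/to/your/sparse/model",            # the fine-tuned sparse checkpoint
    dataset="open_platypus",                      # example calibration dataset (assumption)
    recipe=GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    output_dir="path/to/your/sparse-quantized/model",
    num_calibration_samples=512,
)
```

The quantized output directory can then be passed to `LLM(...)` in place of the sparse checkpoint above.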

For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).

## Learn More

For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:

[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)