diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index 4120c3898f..ea5c49e82e 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -8,43 +8,53 @@ on: env: CADENCE: "commit" - + jobs: detect-changes: runs-on: ubuntu-latest - outputs: - changes-present: ${{ steps.changed-files.outputs.any_modified }} - + changes-present: ${{ steps.changed-files.outputs.all_changed_files }} steps: + - name: prelim + run: | + echo "github.event_name=${{ github.event_name }}" + echo "github.event.pull_request.head.repo.full_name=${{ github.event.pull_request.head.repo.full_name }}" + echo "github.repository=${{ github.repository }}" + echo "repo=${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }}" + echo "changed_files=${{ github.event.pull_request.changed_files }}" + echo "ref=${{ github.event.pull_request.head.ref }}" + env + - name: Checkout uses: actions/checkout@v4 with: + path: '' fetch-depth: 0 + repository: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name || github.repository }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || 'main' }} + - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v45 + uses: neuralmagic/nm-actions/actions/changed-files@update-change-tracking with: - files: | - ** - !examples/** - !tests/e2e/** - !tests/lmeval/** - !tests/examples/** - !**/*.md - !.github/** - .github/workflows/test-check-transformers.yaml - + include-patterns: |- + ^\.github/workflows/test-check-transformers.yaml + .* + exclude-patterns: |- + ^examples + ^tests/e2e + ^tests/lmeval + ^tests/examples + .md$ - name: Log relevant output run: | - echo "changes-present: ${{ steps.changed-files.outputs.any_modified }}" - echo "all modified files: ${{ steps.changed-files.outputs.all_modified_files }}" + echo "changes-present: ${{ steps.changed-files.outputs.all_changed_files }}" shell: bash transformers-tests: needs: [detect-changes] runs-on: gcp-k8s-vllm-l4-solo - if: (contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push') && needs.detect-changes.outputs.changes-present == 'true' + if: (contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push') && needs.detect-changes.outputs.changes-present != '' steps: - uses: actions/setup-python@v5 with: diff --git a/README.md b/README.md index 3ae778835d..95b309f361 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # tool icon LLM Compressor `llmcompressor` is an easy-to-use library for optimizing models for deployment with `vllm`, including: + + * Comprehensive set of quantization algorithms for weight-only and activation quantization * Seamless integration with Hugging Face models and repositories * `safetensors`-based file format compatible with `vllm` @@ -30,16 +32,16 @@ PTQ is performed to reduce the precision of quantizable weights (e.g., linear la ##### [W4A16](./examples/quantization_w4a16/README.md) - Uses GPTQ to compress weights to 4 bits. Requires calibration dataset. -- Useful speed ups in low QPS regimes with more weight compression. -- Recommended for any GPUs types. +- Useful speed ups in low QPS regimes with more weight compression. +- Recommended for any GPUs types. ##### [W8A8-INT8](./examples/quantization_w8a8_int8/README.md) - Uses channel-wise quantization to compress weights to 8 bits using GPTQ, and uses dynamic per-token quantization to compress activations to 8 bits. Requires calibration dataset for weight quantization. Activation quantization is carried out during inference on vLLM. -- Useful for speed ups in high QPS regimes or offline serving on vLLM. -- Recommended for NVIDIA GPUs with compute capability <8.9 (Ampere, Turing, Volta, Pascal, or older). +- Useful for speed ups in high QPS regimes or offline serving on vLLM. +- Recommended for NVIDIA GPUs with compute capability <8.9 (Ampere, Turing, Volta, Pascal, or older). ##### [W8A8-FP8](./examples/quantization_w8a8_fp8/README.md) - Uses channel-wise quantization to compress weights to 8 bits, and uses dynamic per-token quantization to compress activations to 8 bits. Does not require calibration dataset. Activation quantization is carried out during inference on vLLM. -- Useful for speed ups in high QPS regimes or offline serving on vLLM. -- Recommended for NVIDIA GPUs with compute capability >8.9 (Hopper and Ada Lovelace). +- Useful for speed ups in high QPS regimes or offline serving on vLLM. +- Recommended for NVIDIA GPUs with compute capability >8.9 (Hopper and Ada Lovelace). #### Sparsification Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include: diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 5e907b802b..0e9c068e96 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -1,5 +1,7 @@ # Compression and Fine-tuning Entrypoint + + ## Oneshot An ideal compression technique reduces memory footprint while maintaining accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification. @@ -17,7 +19,7 @@ Sparsification reduces model complexity by pruning selected weight values to zer ## Code -Example scripts for all the above formats are located in the [examples](../../../examples/) folder. The [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: +Example scripts for all the above formats are located in the [examples](../../../examples/) folder. The [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: ```python from transformers import AutoModelForCausalLM, AutoTokenizer @@ -68,7 +70,7 @@ oneshot( ..., output_dir="./oneshot_model", # Automatically save the safetensor, config, recipe. Weights are saved in a compressed format ) -``` +``` ### Lifecycle @@ -81,9 +83,9 @@ The oneshot calibration lifecycle consists of three steps: - Patches the model to include additional functionality for saving with quantization configurations. 2. **Oneshot Calibration**: - - Compresses the model based on the recipe (instructions for optimizing the model). The + - Compresses the model based on the recipe (instructions for optimizing the model). The recipe defines the `Modifiers` (e.g., `GPTQModifier`, `SparseGPTModifier`) to apply, which - contain logic how to quantize or sparsify a model. + contain logic how to quantize or sparsify a model. 3. **Postprocessing**: - Saves the model, tokenizer/processor, and configuration to the specified `output_dir`. @@ -147,7 +149,7 @@ Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pyto ```python # Define the teacher model distill_teacher = AutoModelForCausalLM.from_pretrained( - "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", ) @@ -189,7 +191,7 @@ The output terminal will provide the sparsification, quantization and training m train_steps_per_second = 0.107 ``` -### End-to-end Script +### End-to-end Script The end-to-end script for carrying out `oneshot` for `W8A8-FP8` and then knowledge distillation is shown below: ```python @@ -276,4 +278,4 @@ with create_session(): TRL's SFT Trainer can be used for sparse fine-tuning or applying sparse knowledge distillation. Examples are available in the `examples/` folder. - [Sparse-fine-tune a 50% sparse Llama-7b model](../../../examples/trl_mixin/README.md) -- [Sparse-fine-tune a 50% sparse Llama-7b model using knowledge distillation](../../../examples/trl_mixin/README.md) \ No newline at end of file +- [Sparse-fine-tune a 50% sparse Llama-7b model using knowledge distillation](../../../examples/trl_mixin/README.md) diff --git a/src/llmcompressor/transformers/__init__.py b/src/llmcompressor/transformers/__init__.py index 2becd67f11..723ae481a1 100644 --- a/src/llmcompressor/transformers/__init__.py +++ b/src/llmcompressor/transformers/__init__.py @@ -2,6 +2,8 @@ Tools for integrating LLM Compressor with transformers training flows """ + + # flake8: noqa # isort: skip_file diff --git a/tests/llmcompressor/utils/test_helpers.py b/tests/llmcompressor/utils/test_helpers.py index 584780bfbd..c56ea6556e 100644 --- a/tests/llmcompressor/utils/test_helpers.py +++ b/tests/llmcompressor/utils/test_helpers.py @@ -1,5 +1,7 @@ from types import SimpleNamespace + + import pytest import torch diff --git a/tests/lmeval/configs/fp8_static_per_tensor.yaml b/tests/lmeval/configs/fp8_static_per_tensor.yaml index e4d31cef25..8c768e0ad7 100644 --- a/tests/lmeval/configs/fp8_static_per_tensor.yaml +++ b/tests/lmeval/configs/fp8_static_per_tensor.yaml @@ -1,4 +1,6 @@ cadence: "weekly" + + model: meta-llama/Meta-Llama-3-8B-Instruct scheme: FP8 dataset_id: HuggingFaceH4/ultrachat_200k