
Commit 95b20f9

Merge pull request #3428 from EleutherAI/registry_
refactor: lazy registry; lightweight core
2 parents f83f960 + 69741cf commit 95b20f9

File tree

20 files changed (+1515, −397 lines)


.github/workflows/new_tasks.yml

Lines changed: 10 additions & 14 deletions
```diff
@@ -16,7 +16,7 @@ jobs:
     name: Scan for changed tasks
     steps:
       - name: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
         with:
           fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.

@@ -25,7 +25,7 @@ jobs:
       # and prepends the filter name to the standard output names.
       - name: Check task folders
         id: changed-tasks
-        uses: tj-actions/changed-files@v46.0.5
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62
         with:
           # tasks checks the tasks folder and api checks the api folder for changes
           files_yaml: |
@@ -44,28 +44,24 @@ jobs:
           echo "One or more test file(s) has changed."
           echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"

-      - name: Set up Python 3.10
+      - name: Install uv
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        uses: actions/setup-python@v5
+        uses: astral-sh/setup-uv@v7
         with:
-          python-version: '3.10'
-          cache: 'pip'
-          cache-dependency-path: pyproject.toml
+          enable-cache: true
+          python-version: "3.10"
+          activate-environment: true
       - name: Install dependencies
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         run: |
-          python -m pip install --upgrade pip
-          pip install -e '.[dev,ifeval,unitxt,math,longbench]' --extra-index-url https://download.pytorch.org/whl/cpu
-          # Install optional git dependencies
-          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+          uv pip install -e '.[dev,ifeval,unitxt,math,longbench,hf]' --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv
+        run: pytest -x -s -vv tests/test_tasks.py
       # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv
+        run: pytest -x -s -vv -n=auto tests/test_tasks.py
```
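The workflow's install-and-test steps can be reproduced locally. A rough sketch, assuming `uv` is already installed and the commands are run from a clone of the repository:

```shell
# Create a venv pinned to the same Python the workflow uses
uv venv --python 3.10
source .venv/bin/activate

# Install the harness with the same extras as CI, pulling
# CPU-only torch wheels from the PyTorch index
uv pip install -e '.[dev,ifeval,unitxt,math,longbench,hf]' \
  --extra-index-url https://download.pytorch.org/whl/cpu

# Run the task tests the way the workflow now does
pytest -x -s -vv tests/test_tasks.py
```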

.github/workflows/unit_tests.yml

Lines changed: 22 additions & 17 deletions
```diff
@@ -21,17 +21,23 @@ jobs:

     steps:
       - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
+        uses: actions/checkout@v6
         with:
-          python-version: '3.10'
-          cache: pip
-          cache-dependency-path: pyproject.toml
+          fetch-depth: 0
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          python-version: "3.10"
+          activate-environment: true
+      - name: Install pip
+        run: uv pip install pip
       - name: Pre-Commit
         env:
           SKIP: "no-commit-to-branch,mypy"
         uses: pre-commit/[email protected]
+        with:
+          extra_args: --from-ref ${{ github.event.pull_request.base.sha || 'HEAD~1' }} --to-ref HEAD
   # Job 2
   testcpu:
     name: CPU Tests
@@ -43,13 +49,13 @@ jobs:
     timeout-minutes: 30
     steps:
       - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
         with:
+          enable-cache: true
           python-version: ${{ matrix.python-version }}
-          cache: pip
-          cache-dependency-path: pyproject.toml
+          activate-environment: true

       # Cache HuggingFace cache directory for CPU tests
       - name: Cache HuggingFace cache (CPU tests)
@@ -63,17 +69,16 @@ jobs:

       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install hf_xet
+          uv pip install -e '.[dev,unitxt,hf]' --extra-index-url https://download.pytorch.org/whl/cpu
+          uv pip install hf_xet

       - name: Test with pytest
-        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
-        continue-on-error: true # Continue workflow even if tests fail
+        run: pytest -x --showlocals -s -vv -n=auto --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py --ignore=tests/scripts/test_zeno_visualize.py

       # Save test artifacts
       - name: Archive test artifacts
-        uses: actions/upload-artifact@v4
+        if: always() # Upload artifacts even if tests fail
+        uses: actions/upload-artifact@v5
         with:
           name: output_testcpu${{ matrix.python-version }}
           path: |
```
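The `extra_args` added to the Pre-Commit step limit hook runs to files changed on the branch, instead of the whole tree. The same invocation works locally; a sketch, assuming `pre-commit` is installed and `origin/main` is the base branch of your work:

```shell
# Run hooks only on files changed since the base ref, mirroring the
# workflow's --from-ref/--to-ref arguments; SKIP disables the same
# hooks the workflow skips
SKIP="no-commit-to-branch,mypy" pre-commit run \
  --from-ref origin/main --to-ref HEAD
```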

.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -45,3 +45,6 @@ examples/wandb/

 # PyInstaller
 *.spec
+
+#uv
+uv.lock
```

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -27,7 +27,7 @@ repos:
       - id: mixed-line-ending
         args: [ --fix=lf ]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.13.2
+    rev: v0.14.6
     hooks:
       # Run the linter.
       - id: ruff-check
@@ -46,7 +46,7 @@ repos:

         args: [ --check-filenames, --check-hidden, --ignore-words=ignore.txt ]
   - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.32
+    rev: v0.9.33
     hooks:
       - id: pymarkdown
         exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
```

README.md

Lines changed: 83 additions & 23 deletions
`````diff
@@ -63,7 +63,35 @@ cd lm-evaluation-harness
 pip install -e .
 ```

-We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
+### Installing Model Backends
+
+The base installation provides the core evaluation framework. **Model backends must be installed separately** using optional extras:
+
+For HuggingFace transformers models:
+
+```bash
+pip install "lm_eval[hf]"
+```
+
+For vLLM inference:
+
+```bash
+pip install "lm_eval[vllm]"
+```
+
+For API-based models (OpenAI, Anthropic, etc.):
+
+```bash
+pip install "lm_eval[api]"
+```
+
+Multiple backends can be installed together:
+
+```bash
+pip install "lm_eval[hf,vllm,api]"
+```
+
+A detailed table of all optional extras is available at the end of this document.

@@ -75,6 +103,9 @@ A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --

 ### Hugging Face `transformers`

+> [!Important]
+> To use the HuggingFace backend, first install: `pip install "lm_eval[hf]"`
+
 To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU):

 ```bash
@@ -307,9 +338,9 @@ lm_eval --model vllm \
     --batch_size auto
 ```

-To use vllm, do `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.
+To use vllm, do `pip install "lm_eval[vllm]"`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.

-vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
+vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.

 > [!Tip]
 > For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!
@@ -336,14 +367,17 @@ lm_eval --model sglang \
 ```

 > [!Tip]
-> When encountering out of memory (OOM) errors (especially for multiple-choice tasks), try these solutions:
+> When encountering out-of-memory (OOM) errors (especially for multiple-choice tasks), try these solutions:
 >
 > 1. Use a manual `batch_size`, rather than `auto`.
 > 2. Lower KV cache pool memory usage by adjusting `mem_fraction_static` - Add to your model arguments for example `--model_args pretrained=...,mem_fraction_static=0.7`.
 > 3. Increase tensor parallel size `tp_size` (if using multiple GPUs).

 ### Model APIs and Inference Servers

+> [!Important]
+> To use API-based models, first install: `pip install "lm_eval[api]"`
+
 Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.

 To call a hosted model, use:
@@ -581,7 +615,7 @@ To get started with development, first clone the repository and install the dev
 ```bash
 git clone https://github.com/EleutherAI/lm-evaluation-harness
 cd lm-evaluation-harness
-pip install -e ".[dev]"
+pip install -e ".[dev,hf]"
 ```

 ### Implementing new tasks
@@ -607,24 +641,50 @@ The best way to get support is to open an issue on this repo or join the [Eleuth

 Extras dependencies can be installed via `pip install -e ".[NAME]"`

-| NAME                 | Description                    | NAME           | Description                           |
-|----------------------|--------------------------------|----------------|---------------------------------------|
-| tasks                | All task-specific dependencies | api            | API models (Anthropic, OpenAI, local) |
-| acpbench             | ACP Bench tasks                | audiolm_qwen   | Qwen2 audio models                    |
-| ifeval               | IFEval task                    |                |                                       |
-| japanese_leaderboard | Japanese LLM tasks             | gptq           | AutoGPTQ models                       |
-| longbench            | LongBench tasks                | gptqmodel      | GPTQModel models                      |
-| math                 | Math answer checking           | hf_transfer    | Speed up HF downloads                 |
-| multilingual         | Multilingual tokenizers        | ibm_watsonx_ai | IBM watsonx.ai models                 |
-| ruler                | RULER tasks                    | ipex           | Intel IPEX backend                    |
-|                      |                                |                |                                       |
-| dev                  | Linting & contributions        | mamba          | Mamba SSM models                      |
-| promptsource         | PromptSource prompts           | neuronx        | AWS inf2 instances                    |
-| sentencepiece        | Sentencepiece tokenizer        | optimum        | Intel OpenVINO models                 |
-| testing              | Run test suite                 | sae_lens       | SAELens model steering                |
-| unitxt               | Run unitxt tasks               |                |                                       |
-| wandb                | Weights & Biases               | sparsify       | Sparsify model steering               |
-| zeno                 | Result visualization           | vllm           | vLLM models                           |
+### Model Backends
+
+These extras install dependencies required to run specific model backends:
+
+| NAME           | Description                                                      |
+|----------------|------------------------------------------------------------------|
+| hf             | HuggingFace Transformers (torch, transformers, accelerate, peft) |
+| vllm           | vLLM fast inference                                              |
+| api            | API models (OpenAI, Anthropic, local servers)                    |
+| gptq           | AutoGPTQ quantized models                                        |
+| gptqmodel      | GPTQModel quantized models                                       |
+| ibm_watsonx_ai | IBM watsonx.ai models                                            |
+| ipex           | Intel IPEX backend                                               |
+| optimum        | Intel OpenVINO models                                            |
+| neuronx        | AWS Inferentia2 instances                                        |
+| sparsify       | Sparsify model steering                                          |
+| sae_lens       | SAELens model steering                                           |
+
+### Task Dependencies
+
+These extras install dependencies required for specific evaluation tasks:
+
+| NAME                 | Description                    |
+|----------------------|--------------------------------|
+| tasks                | All task-specific dependencies |
+| acpbench             | ACP Bench tasks                |
+| audiolm_qwen         | Qwen2 audio models             |
+| ifeval               | IFEval task                    |
+| japanese_leaderboard | Japanese LLM tasks             |
+| longbench            | LongBench tasks                |
+| math                 | Math answer checking           |
+| multilingual         | Multilingual tokenizers        |
+| ruler                | RULER tasks                    |
+
+### Development & Utilities
+
+| NAME          | Description               |
+|---------------|---------------------------|
+| dev           | Linting & contributions   |
+| hf_transfer   | Speed up HF downloads     |
+| sentencepiece | Sentencepiece tokenizer   |
+| unitxt        | Unitxt tasks              |
+| wandb         | Weights & Biases logging  |
+| zeno          | Zeno result visualization |

 ## Cite as
`````

lm_eval/__init__.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -4,6 +4,15 @@

 __version__ = "0.4.9.2"

+# Enable hf_transfer if available
+try:
+    import hf_transfer  # type: ignore
+    import huggingface_hub.constants  # type: ignore
+
+    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+except ImportError:
+    pass
+

 # Lazy-load .evaluator module to improve CLI startup
 def __getattr__(name):
```
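The `__getattr__` hook at the end of this diff is the PEP 562 module-level lazy-import pattern: importing `lm_eval` stays cheap, and the heavy submodule loads only when first accessed. A minimal self-contained sketch of the mechanism, using a hypothetical `evaluator` stand-in rather than the harness's real module:

```python
import sys
import types

calls = []  # records each time the lazy loader fires


def __getattr__(name):
    # PEP 562: invoked for attribute lookups on this module that miss
    # its namespace -- the expensive import would happen only here.
    if name == "evaluator":
        calls.append(name)
        mod = types.SimpleNamespace(simple_evaluate=lambda: "evaluated")
        globals()[name] = mod  # cache so the loader runs at most once
        return mod
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


this = sys.modules[__name__]  # access attributes the way an importer would

assert calls == []                       # nothing loaded yet
print(this.evaluator.simple_evaluate())  # first access triggers the load
assert calls == ["evaluator"]
this.evaluator                           # cached: loader not called again
assert calls == ["evaluator"]
```

The `globals()[name] = mod` caching step is what keeps repeated attribute access free after the first load.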
