
Commit 28001d2

remove all; reformat table (#3107)
1 parent 71d0289 commit 28001d2

File tree: 6 files changed (+65 / -89 lines)

6 files changed

+65
-89
lines changed

.github/workflows/new_tasks.yml

Lines changed: 2 additions & 2 deletions
@@ -50,12 +50,12 @@ jobs:
         with:
           python-version: 3.9
           cache: 'pip'
-          cache-dependency-path: setup.py
+          cache-dependency-path: pyproject.toml
       - name: Install dependencies
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,ifeval,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
           # Install optional git dependencies
           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
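
Two changes land here: the pip cache is now keyed on `pyproject.toml` instead of `setup.py` (presumably because the project's dependency metadata lives in `pyproject.toml`), and `unitxt` is requested explicitly, since the `pyproject.toml` change below splits it out of the `dev` extra.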

.github/workflows/unit_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -53,7 +53,7 @@ jobs:

       # Cache HuggingFace cache directory for CPU tests
       - name: Cache HuggingFace cache (CPU tests)
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         id: cache-hf-cpu
         with:
           path: ~/.cache/huggingface
@@ -64,7 +64,7 @@
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,unitxt]' --extra-index-url https://download.pytorch.org/whl/cpu
           pip install hf_xet

       - name: Test with pytest
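
Same reasoning as in `new_tasks.yml`: with `unitxt` split out of `dev` (see the `pyproject.toml` hunk below), the test install must name it explicitly. The cache step also moves to `actions/cache@v4`, the current major version of the action.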

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.10
+    rev: v0.12.2
     hooks:
       # Run the linter.
       - id: ruff
@@ -47,7 +47,7 @@ repos:
           )$
       args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
   - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
+    rev: v0.9.30
     hooks:
       - id: pymarkdown
         exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$

README.md

Lines changed: 18 additions & 31 deletions
@@ -599,37 +599,24 @@ The best way to get support is to open an issue on this repo or join the [Eleuth

 Extras dependencies can be installed via `pip install -e ".[NAME]"`

-| Name                 | Use                                                |
-| -------------------- | -------------------------------------------------- |
-| api                  | For using api models (Anthropic, OpenAI API)       |
-| audiolm_qwen         | For running Qwen2 audio models                     |
-| deepsparse           | For running NM's DeepSparse models                 |
-| dev                  | For linting PRs and contributions                  |
-| gptq                 | For loading models with AutoGPTQ                   |
-| gptqmodel            | For loading models with GPTQModel                  |
-| hf_transfer          | For speeding up HF Hub file downloads              |
-| ibm_watsonx_ai       | For using IBM watsonx.ai model apis                |
-| ifeval               | For running the IFEval task                        |
-| ipex                 | For running on optimum-intel ipex backend          |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks         |
-| longbench            | For running LongBench tasks                        |
-| mamba                | For loading Mamba SSM models                       |
-| math                 | For running math task answer checking              |
-| multilingual         | For multilingual tokenizers                        |
-| neuronx              | For running on AWS inf2 instances                  |
-| optimum              | For running Intel OpenVINO models                  |
-| promptsource         | For using PromptSource prompts                     |
-| ruler                | For running RULER tasks                            |
-| sae_lens             | For using SAELens to steer models                  |
-| sentencepiece        | For using the sentencepiece tokenizer              |
-| sparseml             | For using NM's SparseML models                     |
-| sparsify             | For using Sparsify to steer models                 |
-| testing              | For running library test suite                     |
-| vllm                 | For loading models with vLLM                       |
-| wandb                | For integration with `Weights and Biases` platform |
-| zeno                 | For visualizing results with Zeno                  |
-| -------------------- | -------------------------------------------------- |
-| all                  | Loads all extras (not recommended)                 |
+| NAME                 | Description                    | NAME           | Description                           |
+|----------------------|--------------------------------|----------------|---------------------------------------|
+| tasks                | All task-specific dependencies | api            | API models (Anthropic, OpenAI, local) |
+| acpbench             | ACP Bench tasks                | audiolm_qwen   | Qwen2 audio models                    |
+| ifeval               | IFEval task                    | deepsparse     | DeepSparse models (CPU)               |
+| japanese_leaderboard | Japanese LLM tasks             | gptq           | AutoGPTQ models                       |
+| longbench            | LongBench tasks                | gptqmodel      | GPTQModel models                      |
+| math                 | Math answer checking           | hf_transfer    | Speed up HF downloads                 |
+| multilingual         | Multilingual tokenizers        | ibm_watsonx_ai | IBM watsonx.ai models                 |
+| ruler                | RULER tasks                    | ipex           | Intel IPEX backend                    |
+|                      |                                |                |                                       |
+| dev                  | Linting & contributions        | mamba          | Mamba SSM models                      |
+| promptsource         | PromptSource prompts           | neuronx        | AWS inf2 instances                    |
+| sentencepiece        | Sentencepiece tokenizer        | optimum        | Intel OpenVINO models                 |
+| testing              | Run test suite                 | sae_lens       | SAELens model steering                |
+| unitxt               | Run unitxt tasks               | sparseml       | SparseML models (CPU)                 |
+| wandb                | Weights & Biases               | sparsify       | Sparsify model steering               |
+| zeno                 | Result visualization           | vllm           | vLLM models                           |

 ## Cite as
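
Net effect, matching the commit title: the long single-column Name/Use table is folded into a two-pane NAME/Description layout, the catch-all `all` extra (previously marked "not recommended") is removed, and task-only dependencies are grouped under the new `tasks` extra, e.g. `pip install -e ".[tasks]"`.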

lm_eval/api/model.py

Lines changed: 38 additions & 29 deletions
@@ -3,15 +3,19 @@
 import json
 import logging
 import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, TypeVar, Union

-import transformers
-from sqlitedict import SqliteDict
 from tqdm import tqdm

 from lm_eval import utils


+if TYPE_CHECKING:
+    from sqlitedict import SqliteDict
+
+    from lm_eval.api.instance import Instance
+
+
 eval_logger = logging.getLogger(__name__)

 T = TypeVar("T", bound="LM")
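
The import hunk moves `sqlitedict` (and, further down, `transformers`) off the module's import path. A minimal sketch of the same deferred-import pattern, with a hypothetical `open_db` helper standing in for the real call sites:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; never executed at runtime.
    from sqlitedict import SqliteDict


def open_db(path: str) -> "SqliteDict":
    # The runtime import is deferred to first use, so importing this
    # module succeeds even when sqlitedict is not installed.
    from sqlitedict import SqliteDict

    return SqliteDict(path, autocommit=True)
```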
@@ -27,10 +31,10 @@ def __init__(self) -> None:
         # set rank and world size to a single process, by default.
         self._rank = 0
         self._world_size = 1
-        self.cache_hook = CacheHook(None)
+        self.cache_hook: "CacheHook" = CacheHook(None)

     @abc.abstractmethod
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(self, requests) -> list[tuple[float, bool]]:
         """Compute log-likelihood of generating a continuation from a context.
         Downstream tasks should attempt to use loglikelihood instead of other
         LM calls whenever possible.
@@ -55,7 +59,7 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
         pass

     @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(self, requests) -> list[float]:
         """Compute full log-likelihood of a string, with no truncation, for perplexity computation
         - We will use the full max context length of the model.
         - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
@@ -97,7 +101,7 @@ def loglikelihood_rolling(self, requests) -> List[float]:

     # TODO: Add an optional max length
     @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> list[str]:
         """Generate greedily until a stopping sequence

         :param requests: list[Instance]
@@ -114,7 +118,7 @@ def generate_until(self, requests) -> List[str]:
         pass

     def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
+        self, chat_history: list[dict[str, str]], add_generation_prompt=True
     ) -> str:
         """
         Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
@@ -165,8 +169,7 @@ def create_from_arg_obj(
         - Instance of the LM class.
         """

-        additional_config = {} if additional_config is None else additional_config
-        additional_config = {
+        additional_config = additional_config or {} | {
             k: v for k, v in additional_config.items() if v is not None
         }

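A parsing note on the new one-liner: in Python, `|` (dict union, 3.9+) binds tighter than `or`, so the expression groups as `additional_config or ({} | {...})`, and a truthy `additional_config` is returned as-is. A quick standalone check of that precedence:

```python
# Precedence check: dict-union `|` binds tighter than `or` (Python 3.9+).
cfg = {"a": 1, "b": None}
result = cfg or {} | {k: v for k, v in cfg.items() if v is not None}
assert result == {"a": 1, "b": None}  # truthy left operand short-circuits `or`
```
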
@@ -204,56 +207,58 @@ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:

         return ""

-    def set_cache_hook(self, cache_hook) -> None:
+    def set_cache_hook(self, cache_hook: "CacheHook") -> None:
         self.cache_hook = cache_hook


 ### SQLite-based caching of LM responses
-def hash_args(attr, args):
+def hash_args(attr: str, args: Iterable[Any]) -> str:
     dat = json.dumps([attr] + list(args))
     return hashlib.sha256(dat.encode("utf-8")).hexdigest()


 class CacheHook:
-    def __init__(self, cachinglm) -> None:
+    def __init__(self, cachinglm: Optional["CachingLM"]) -> None:
         if cachinglm is None:
-            self.dbdict = None
+            self.dbdict: Optional["SqliteDict"] = None
             return

         self.dbdict = cachinglm.dbdict

-    def add_partial(self, attr, req, res) -> None:
+    def add_partial(self, attr: str, req: Iterable[Any], res: Any) -> None:
         if self.dbdict is None:
             return
         hsh = hash_args(attr, req)
         self.dbdict[hsh] = res


 class CachingLM:
-    def __init__(self, lm, cache_db) -> None:
+    def __init__(self, lm: LM, cache_db: str) -> None:
         """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

         :param lm: LM
             Underlying LM
         :param cache_db: str
             Path to cache db
         """
-        self.lm = lm
-        self.cache_db = cache_db
+        from sqlitedict import SqliteDict
+
+        self.lm: LM = lm
+        self.cache_db: str = cache_db
         if os.path.dirname(cache_db):
             os.makedirs(os.path.dirname(cache_db), exist_ok=True)
         self.dbdict = SqliteDict(cache_db, autocommit=True)

         # add hook to lm
         lm.set_cache_hook(self.get_cache_hook())

-    def __getattr__(self, attr: str):
+    def __getattr__(self, attr: str) -> Any:
         lm_attr = getattr(self.lm, attr)
         if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
             eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
             return lm_attr

-        def fn(requests):
+        def _fn(requests: list["Instance"]) -> list["Instance"]:
             res = []
             remaining_reqs = []
             warned = False
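
To make the cache-key scheme concrete, here is `hash_args` rerun standalone, exactly as defined in the hunk above (the sample request args are hypothetical):

```python
import hashlib
import json


def hash_args(attr, args):
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode("utf-8")).hexdigest()


# Identical (method, args) pairs always map to the same SQLite key,
# which is what lets CacheHook.add_partial and the lookup in _fn agree.
key = hash_args("generate_until", [["Q: 2+2 =", {"until": ["\n"]}]])
assert key == hash_args("generate_until", [["Q: 2+2 =", {"until": ["\n"]}]])
```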
@@ -306,9 +311,9 @@ def fn(requests):

             return res

-        return fn
+        return _fn

-    def get_cache_hook(self):
+    def get_cache_hook(self) -> "CacheHook":
         return CacheHook(self)

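Putting the pieces together, a minimal caller-side sketch (the `EchoLM` stub is hypothetical; only `LM`, `CachingLM`, and the SQLite path behave as shown in this diff):

```python
from lm_eval.api.model import LM, CachingLM


class EchoLM(LM):
    # Hypothetical stub implementing the three abstract request methods.
    def loglikelihood(self, requests):
        return [(0.0, True) for _ in requests]

    def loglikelihood_rolling(self, requests):
        return [0.0 for _ in requests]

    def generate_until(self, requests):
        return ["" for _ in requests]


# Wrapping memoizes request results in SQLite across runs; CachingLM also
# installs its cache hook on the wrapped model, as the diff shows.
lm = CachingLM(EchoLM(), "cache/echo.db")
```
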
@@ -331,19 +336,23 @@ def prefix_token_id(self):
         return self.eot_token_id

     @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
+    def tok_encode(self, string: str, **kwargs) -> list[int]:
         """
         Tokenize a string using the model's tokenizer and return a list of token IDs.
         """
         pass

     @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
+    def _loglikelihood_tokens(
+        self, requests: list["Instance"], **kwargs
+    ) -> list[tuple[float, bool]]:
         pass

     def _encode_pair(
         self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
+    ) -> tuple[list[int], list[int]]:
+        import transformers
+
         n_spaces = len(context) - len(context.rstrip())
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
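
The context lines show the trailing-whitespace handoff in `_encode_pair`; traced by hand with hypothetical values (the matching trim of `context` sits on the next line of the file, outside this hunk):

```python
context, continuation = "Question: 2+2 = ", "4"
n_spaces = len(context) - len(context.rstrip())        # 1 trailing space
if n_spaces > 0:
    continuation = context[-n_spaces:] + continuation  # " 4"
    context = context[:-n_spaces]                      # "Question: 2+2 =" (assumed next line)
```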
@@ -364,8 +373,8 @@ def _encode_pair(
         return context_enc, continuation_enc

     def loglikelihood(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+        self, requests: list["Instance"], disable_tqdm: bool = False
+    ) -> list[tuple[float, bool]]:
         new_reqs = []
         for context, continuation in [req.args for req in requests]:
             if context == "":
@@ -384,11 +393,11 @@
     @abc.abstractmethod
     def loglikelihood_rolling(
         self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
+    ) -> list[float]:
         pass

     @abc.abstractmethod
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> list[str]:
         pass

     def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
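
Taken together, the file-wide pattern is: `typing.List`/`Tuple`/`Dict` annotations become the builtin `list`/`tuple`/`dict` generics (PEP 585, which requires Python 3.9+ at runtime and matches the 3.9 pin in the workflow above), and `transformers` and `sqlitedict` become deferred imports, so importing `lm_eval.api.model` no longer requires either package to be installed.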

pyproject.toml

Lines changed: 3 additions & 23 deletions
@@ -61,7 +61,7 @@ acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planne
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]
@@ -82,38 +82,18 @@ sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
+unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]

 [tool.pymarkdown]