Commit e941a55

Simplified system for extended tasks (#123)

Co-authored-by: Nathan Habib <[email protected]>

1 parent ef631cf

File tree: 12 files changed, +80 −43 lines

.github/workflows/tests.yaml
Lines changed: 1 addition & 1 deletion

```diff
@@ -26,7 +26,7 @@ jobs:
           cache: 'pip'
       - name: Install lighteval in editable mode
         run: |
-          pip install -e .[dev]
+          pip install -e .[dev,extended_tasks]
       - name: Get cached files
         uses: actions/cache@v2
         id: get-cache
```

README.md
Lines changed: 17 additions & 10 deletions

````diff
@@ -167,28 +167,38 @@ python run_evals_accelerate.py \
 
 Independently of the default tasks provided in `lighteval` that you will find in the `tasks_table.jsonl` file, you can use `lighteval` to evaluate models on tasks that require special processing (or have been added by the community). These tasks have their own evaluation suites and are defined as follows:
 
-* `extended`: tasks which have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended_tasks`](./extended_tasks) folder for examples.
+* `extended`: tasks which have complex pre- or post-processing and are added by the `lighteval` maintainers. See the [`extended_tasks`](./src/lighteval/tasks/extended_tasks) folder for examples.
 * `community`: tasks which have been added by the community. See the [`community_tasks`](./community_tasks) folder for examples.
 * `custom`: tasks which are defined locally and not present in the core library. Use this suite if you want to experiment with designing a special metric or task.
 
-For example, to run an extended task you can run:
+
+For example, to run an extended task like ifeval, you can run:
+```shell
+python run_evals_accelerate.py \
+    --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
+    --use_chat_template \ # optional, if you want to run the evaluation with the chat template
+    --tasks "extended|ifeval|0|0" \
+    --output_dir "./evals"
+```
+
+To run a community or custom task, you can use (note the custom_tasks flag):
 
 ```shell
 python run_evals_accelerate.py \
     --model_args="pretrained=<path to model on the hub>"\
     --tasks <task parameters> \
-    --extended_tasks "extended_tasks" \
+    --custom_tasks <path to your custom or community task> \
     --output_dir output_dir
 ```
 
-For example, to launch `lighteval` on `ifeval` for `HuggingFaceH4/zephyr-7b-beta`, run:
+For example, to launch `lighteval` on `arabic_mmlu:abstract_algebra` for `HuggingFaceH4/zephyr-7b-beta`, run:
 
 ```shell
 python run_evals_accelerate.py \
     --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
     --use_chat_template \ # optional, if you want to run the evaluation with the chat template
-    --tasks "extended|ifeval|0|0" \
-    --extended_tasks "extended_tasks" \
+    --tasks "community|arabic_mmlu:abstract_algebra|5|1" \
+    --custom_tasks "community_tasks/arabic_evals" \
     --output_dir "./evals"
 ```
 
@@ -209,7 +219,7 @@ However, we are very grateful to the Harness and HELM teams for their continued
 - [logging](https://github.com/huggingface/lighteval/tree/main/src/lighteval/logging): Our loggers, to display experiment information and push it to the hub after a run
 - [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions.
 - [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models.
-- [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`.
+- [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended).
 - [tasks_examples](https://github.com/huggingface/lighteval/tree/main/tasks_examples) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
 - [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks.
 
@@ -252,9 +262,6 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t
 
 Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`.
 
-### Extended evaluations
-Proceed as for community evaluations, but in the `extended_tasks` folder.
-
 #### Community evaluations
 Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets.
````
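The `--tasks` argument in the README examples above packs four fields into one pipe-separated string: suite, task name, few-shot count, and a truncation flag. A minimal sketch of how such a spec string can be parsed; the `TaskSpec` class and field names are illustrative assumptions, not lighteval's actual internals:

```python
# Hypothetical parser for a "suite|task|fewshot|truncate" spec string,
# mirroring the format used in the README examples above.
from dataclasses import dataclass


@dataclass
class TaskSpec:
    suite: str      # e.g. "extended", "community", "custom", "lighteval"
    task: str       # e.g. "ifeval" or "arabic_mmlu:abstract_algebra"
    few_shot: int   # number of in-context examples
    truncate: int   # whether few-shot examples may be truncated to fit


def parse_task_spec(spec: str) -> TaskSpec:
    suite, task, few_shot, truncate = spec.split("|")
    return TaskSpec(suite, task, int(few_shot), int(truncate))


# Several tasks can be passed at once, comma-separated.
specs = [
    parse_task_spec(s)
    for s in "extended|ifeval|0|0,community|arabic_mmlu:abstract_algebra|5|1".split(",")
]
```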

run_evals_accelerate.py
Lines changed: 0 additions & 6 deletions

```diff
@@ -104,12 +104,6 @@ def get_parser():
         default=None,
         help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
     )
-    parser.add_argument(
-        "--extended_tasks",
-        type=str,
-        default=None,
-        help="Path to the folder which contains all extended tasks",
-    )
     group.add_argument(
         "--tasks",
         type=str,
```
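After this deletion, extended tasks are discovered automatically and only `--custom_tasks` remains for user-supplied task files. A simplified argparse sketch of the surviving interface (a stand-in for the real parser, for illustration only):

```python
import argparse

# Simplified stand-in for lighteval's CLI after this commit: a single
# --custom_tasks flag covers community and custom tasks, while extended
# tasks no longer need a path flag at all.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--custom_tasks",
    type=str,
    default=None,
    help="Path to a file with custom task definitions",
)
parser.add_argument("--tasks", type=str, default=None)

args = parser.parse_args(
    ["--tasks", "community|arabic_mmlu:abstract_algebra|5|1",
     "--custom_tasks", "community_tasks/arabic_evals"]
)
```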

src/lighteval/main_accelerate.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -81,7 +81,7 @@ def main(args):
     with accelerator.main_process_first() if accelerator is not None else nullcontext():
         task_names_list, few_shots_dict = taskinfo_selector(args.tasks)
         task_dict = Registry(cache_dir=env_config.cache_dir).get_task_dict(
-            task_names_list, custom_tasks=args.custom_tasks, extended_tasks=args.extended_tasks
+            task_names_list, custom_tasks=args.custom_tasks
         )
         LightevalTask.load_datasets(task_dict.values(), args.dataset_loading_processes)
```
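With the `extended_tasks` argument gone, the registry can discover extended tasks by walking a fixed module list instead of a user-supplied folder. A hedged sketch of that pattern, using stand-in modules; the `TASKS_TABLE` attribute name matches lighteval's task-module convention, but the rest is illustrative:

```python
from types import SimpleNamespace

# Stand-ins for extended task modules, each exposing a TASKS_TABLE the
# way lighteval's extended task modules do.
ifeval = SimpleNamespace(TASKS_TABLE=[{"name": "ifeval", "suite": ["extended"]}])
tiny = SimpleNamespace(TASKS_TABLE=[{"name": "tiny:gsm8k", "suite": ["extended"]}])

AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny]


def collect_extended_tasks(modules):
    """Merge every module's task table into one name -> config mapping."""
    registry = {}
    for module in modules:
        for config in module.TASKS_TABLE:
            registry[config["name"]] = config
    return registry


tasks = collect_extended_tasks(AVAILABLE_EXTENDED_TASKS_MODULES)
```

Because the module list is baked into the library, every caller sees the same extended tasks without passing a path around, which is what lets `main_accelerate.py` drop the extra keyword argument.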

New file
Lines changed: 33 additions & 0 deletions

```diff
@@ -0,0 +1,33 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from lighteval.utils import can_load_extended_tasks
+
+
+if can_load_extended_tasks():
+    import lighteval.tasks.extended.ifeval.main as ifeval
+    import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
+
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks]
+
+else:
+    AVAILABLE_EXTENDED_TASKS_MODULES = []
```
extended_tasks/ifeval/instructions.py renamed to src/lighteval/tasks/extended/ifeval/instructions.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@
 
 import langdetect
 
-import extended_tasks.ifeval.instructions_utils as instructions_util
+import lighteval.tasks.extended.ifeval.instructions_utils as instructions_util
 
 
 logger = logging.getLogger(__name__)
```

extended_tasks/ifeval/instructions_registry.py renamed to src/lighteval/tasks/extended/ifeval/instructions_registry.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """Registry of all instructions."""
-import extended_tasks.ifeval.instructions as instructions
+import lighteval.tasks.extended.ifeval.instructions as instructions
 
 
 _KEYWORD = "keywords:"
```

extended_tasks/ifeval/main.py renamed to src/lighteval/tasks/extended/ifeval/main.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@
 import numpy as np
 from aenum import extend_enum
 
-import extended_tasks.ifeval.instructions_registry as instructions_registry
+import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
 from lighteval.metrics import Metrics
 from lighteval.metrics.utils import (
     MetricCategory,
```

extended_tasks/tiny_benchmarks/main.py renamed to src/lighteval/tasks/extended/tiny_benchmarks/main.py
Lines changed: 5 additions & 3 deletions

```diff
@@ -27,6 +27,7 @@
 Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"`
 """
 import os
+import pathlib
 import pickle
 
 import numpy as np
@@ -40,7 +41,6 @@
 from lighteval.metrics.normalizations import gsm8k_normalizer
 from lighteval.metrics.utils import MetricCategory, MetricUseCase
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
 
 
 # Utility functions
@@ -89,13 +89,15 @@ def __init__(self, task: str):
         self.num_samples = 0
 
     def download(self):
+        # Likely to crash in // processes if we don't include the pkl
+        path_dld = os.path.join(pathlib.Path(__file__).parent.resolve(), "tinyBenchmarks.pkl")
         # Downloading files
-        if not os.path.isfile("extended_tasks/tiny_benchmarks/tinyBenchmarks.pkl"):
+        if not os.path.isfile(path_dld):
             url = "https://raw.githubusercontent.com/felipemaiapolo/tinyBenchmarks/main/tinyBenchmarks/tinyBenchmarks.pkl"
             response = requests.get(url)
             if response.status_code == 200:
                 # Write the content to a file
-                with open("extended_tasks/tiny_benchmarks/tinyBenchmarks.pkl", "wb") as file:
+                with open(path_dld, "wb") as file:
                     file.write(response.content)
 
     def compute(self, **args):
```
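The fix above replaces a cwd-relative cache path with one anchored at the module's own directory, so every worker process resolves the same pickle regardless of where it was launched. The pattern in isolation, with a stubbed fetch function and a temp directory standing in for the real download and module directory:

```python
import os
import pathlib
import tempfile


def cache_path(base_dir: str, filename: str = "tinyBenchmarks.pkl") -> str:
    # Anchor the cache file at a fixed directory (in the diff above,
    # pathlib.Path(__file__).parent.resolve()) rather than relying on
    # the current working directory.
    return os.path.join(pathlib.Path(base_dir).resolve(), filename)


def download_once(base_dir: str, fetch) -> str:
    """Fetch the payload only if it is not already cached on disk."""
    path = cache_path(base_dir)
    if not os.path.isfile(path):
        with open(path, "wb") as file:
            file.write(fetch())
    return path


# Demo: the second call finds the cached file, so its fetch callable
# (which would raise if invoked) is never called.
with tempfile.TemporaryDirectory() as tmp:
    first = download_once(tmp, lambda: b"weights")
    second = download_once(tmp, lambda: (_ for _ in ()).throw(RuntimeError))
    assert first == second
```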
