
Commit 35babcb

Remove suites in task configs example and fix task with hf_filters (#1051)

* remove suites and make fewshot optional
* fix docs to remove suites and fewshots
* fix tests
* remove suite argument in task config
* fix attempt to cache a functools.partial function
* fix styling

1 parent: 6524c6a

File tree: 7 files changed (+14, −28 lines)


docs/source/contributing-to-multilingual-evaluations.mdx
Lines changed: 0 additions & 2 deletions

@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
docs/source/quicktour.mdx
Lines changed: 1 addition & 7 deletions

@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI.
 For example
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
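With the `{suite}|` prefix gone, a concrete instance of the parametrized form above would look as follows; the task and parameter names here are made-up placeholders for illustration, not values from this diff:

```txt
# old (removed): lighteval|mytask@length_normalization=True|0
# new:
mytask@length_normalization=True|0
```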

docs/source/saving-and-reading-results.mdx
Lines changed: 0 additions & 3 deletions

@@ -247,9 +247,6 @@ The main results file contains several sections:
             "Question="
         ],
         "num_samples": null,
-        "suite": [
-            "lighteval"
-        ],
         "original_num_docs": 1319,
         "effective_num_docs": 1,
         "must_remove_duplicate_docs": null,
examples/nanotron/custom_evaluation_tasks.py
Lines changed: 0 additions & 8 deletions

@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )

examples/nanotron/custom_task.py
Lines changed: 0 additions & 2 deletions

@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
src/lighteval/cli_args.py
Lines changed: 1 addition & 1 deletion

@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None,  # Required argument, no default
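Per the updated help string, the `suite|task|version|split` form is gone: tasks are plain names with an optional `|fewshot` suffix, joined by commas. A hedged illustration (the task names are placeholders, not from this diff):

```txt
# one task, few-shot count omitted
mytask
# several tasks, explicit few-shot counts
mytask|5,othertask|0
```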

src/lighteval/tasks/lighteval_task.py
Lines changed: 12 additions & 5 deletions

@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}"  # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False):  # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,17 +171,23 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
                             repr_v = repr(metric_v)
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
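Why the special case: a `functools.partial` object is callable but carries no `__name__`, so the old `metric_v.__name__` lookup raised `AttributeError` whenever a metric function was a `partial` (as happens for parametrized tasks). A minimal, self-contained sketch of the formatting rule the patch adds; `exact_match` here is a made-up stand-in, not a lighteval function:

```python
import functools


def exact_match(pred, gold, strip=False):
    # Stand-in metric function, for demonstration only.
    p, g = (pred.strip(), gold.strip()) if strip else (pred, gold)
    return p == g


for metric_v in (exact_match, functools.partial(exact_match, strip=True)):
    if isinstance(metric_v, functools.partial):
        # partial objects expose the wrapped callable as .func
        func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
        repr_v = f"partial({func_name}, ...)"
    elif callable(metric_v):
        repr_v = getattr(metric_v, "__name__", repr(metric_v))
    else:
        repr_v = repr(metric_v)
    print(repr_v)

# Prints:
#   exact_match
#   partial(exact_match, ...)
```

Note the branch order: the `functools.partial` check must precede the generic `Callable` check, since a partial is itself callable and would otherwise hit the `__name__` lookup.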
