Commit b5cbd91

fix lighteval task inspect command and tiny bench task (#992)

* fix
* revert unneeded changes

Co-authored-by: Clémentine Fourrier <[email protected]>

1 parent: 5aa09c5

File tree (4 files changed: +35 −35 lines)

  src/lighteval/main_tasks.py
  src/lighteval/metrics/metrics_corpus.py
  src/lighteval/tasks/tasks/ifbench/instructions.py
  src/lighteval/tasks/tasks/tiny_benchmarks/main.py

src/lighteval/main_tasks.py (3 additions, 2 deletions)

@@ -34,6 +34,7 @@
 @app.command()
 def inspect(
     tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")],
+    load_multilingual: Annotated[bool, Option(help="Whether to load multilingual tasks")] = False,
     custom_tasks: custom_tasks.type = custom_tasks.default,
     num_samples: Annotated[int, Option(help="Number of samples to display")] = 10,
     show_config: Annotated[bool, Option(help="Will display the full task config")] = False,
@@ -46,15 +47,15 @@ def inspect(

     from lighteval.tasks.registry import Registry

-    registry = Registry(custom_tasks=custom_tasks, load_multilingual=True)
+    registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_multilingual)

     # Loading task
     task_dict = registry.load_tasks()
     for name, task in task_dict.items():
         print("-" * 10, name, "-" * 10)
         if show_config:
             print("-" * 10, "CONFIG")
-            task.cfg.print()
+            task.config.print()
         for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]):
             if ix == 0:
                 print("-" * 10, "SAMPLES")

src/lighteval/metrics/metrics_corpus.py (1 addition, 1 deletion)

@@ -47,7 +47,7 @@

 class CorpusLevelComputation(ABC):
     @abstractmethod
-    def compute_corpus(self):
+    def compute_corpus(self, items):
         raise NotImplementedError

     def __str__(self):
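
The abstract method now declares the `items` argument that concrete corpus-level metrics (including the TinyCorpusAggregator changed below) already receive. A hypothetical, self-contained subclass sketch, only to illustrate the updated signature:

    from abc import ABC, abstractmethod

    class CorpusLevelComputation(ABC):  # re-declared locally for the sketch
        @abstractmethod
        def compute_corpus(self, items):
            raise NotImplementedError

    class MeanScore(CorpusLevelComputation):  # hypothetical example class
        def compute_corpus(self, items):
            # `items` is the list of per-sample results gathered over the corpus.
            return sum(items) / len(items)

    print(MeanScore().compute_corpus([1.0, 0.0, 1.0]))  # 0.666...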

src/lighteval/tasks/tasks/ifbench/instructions.py (1 addition, 1 deletion)

@@ -790,7 +790,7 @@ def check_following(self, value):
         """Checks if the response only includes words with prime length."""
         value = value.translate(str.maketrans("", "", string.punctuation))
         words = value.split()
-        primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97)
+        primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}
         for word in words:
             if len(word) not in primes:
                 return False
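
Why the one-character fix matters: set() takes at most one iterable argument, so the old call raised a TypeError at runtime instead of building the set of prime word lengths. A quick self-contained demonstration:

    try:
        set(2, 3, 5)  # old form: TypeError, set expected at most 1 argument
    except TypeError as err:
        print(err)

    primes = {2, 3, 5}  # new form: a set literal
    assert primes == set([2, 3, 5])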

src/lighteval/tasks/tasks/tiny_benchmarks/main.py (30 additions, 31 deletions)

@@ -27,16 +27,16 @@

 import numpy as np
 import requests
-from aenum import extend_enum
 from scipy.optimize import minimize

 import lighteval.tasks.default_prompts as prompt
-from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
+from lighteval.metrics.metrics import CorpusLevelMetricGrouping
 from lighteval.metrics.metrics_corpus import CorpusLevelComputation
 from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
 from lighteval.metrics.normalizations import gsm8k_normalizer
+from lighteval.models.model_output import ModelResponse
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import SamplingMethod
+from lighteval.tasks.requests import Doc, SamplingMethod


 # Utility functions
@@ -96,18 +96,18 @@ def download(self):
         with open(path_dld, "wb") as file:
             file.write(response.content)

-    def compute(self, **args):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
         if self.task == "gsm8k":
             res = ExactMatches(
                 strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer
-            ).compute(**args)
+            ).compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)
         else:
-            res = LoglikelihoodAcc().compute(**args)
+            res = LoglikelihoodAcc().compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)

-    def compute_corpus(self, y_input):
-        if len(y_input) == self.num_samples and self.estimates is not None:
+    def compute_corpus(self, items):
+        if len(items) == self.num_samples and self.estimates is not None:
             return self.estimates[self.task]

         # We load the weights for the relevant examples
@@ -144,7 +144,7 @@ def compute_corpus(self, y_input):
         # Creating vector y and estimating theta
         y = np.zeros(N)
         for i, j in enumerate(seen_examples):
-            y[j] = y_input[i]
+            y[j] = items[i]

         # Getting estimates
         theta = fit_theta(y, seen_examples, A, B)
@@ -170,7 +170,7 @@ def compute_corpus(self, y_input):
             estimates[scenario]["pirt"] = IRTp
             estimates[scenario]["gpirt"] = IRTpp

-        self.num_samples = len(y_input)
+        self.num_samples = len(items)
         self.estimates = estimates

         return estimates[self.task]
@@ -233,6 +233,25 @@ def compute_corpus(self, y_input):
     # },
 ]

+metrics = {}
+
+for task_param in task_params:
+    name = task_param["name"]
+    if name == "gsm8k":
+        category = SamplingMethod.GENERATIVE
+    else:
+        category = SamplingMethod.LOGPROBS
+
+    metrics[f"tinybench_metric_{name}"] = (
+        CorpusLevelMetricGrouping(
+            metric_name=TinyCorpusAggregator.METRICS,
+            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
+            sample_level_fn=TinyCorpusAggregator(name),
+            category=category,
+            corpus_level_fn=TinyCorpusAggregator(name),
+        ),
+    )
+
 TASKS_TABLE = []
 for task in task_params:
     name = task["name"]
@@ -251,28 +270,8 @@ def compute_corpus(self, y_input):
         evaluation_splits=task["evaluation_split"],
         few_shots_split=None,
         few_shots_select="random_sampling",
-        metrics=[f"tinybench_metric_{name}"],
+        metrics=metrics[f"tinybench_metric_{name}"],
         generation_size=generation_size,
         stop_sequence=stop_sequence,
     )
     TASKS_TABLE.append(task)
-
-
-# CUSTOM METRIC
-for task_param in task_params:
-    name = task_param["name"]
-    if name == "gsm8k":
-        category = SamplingMethod.GENERATIVE
-    else:
-        category = SamplingMethod.LOGPROBS
-
-    extend_enum(
-        Metrics,
-        f"tinybench_metric_{name}",
-        CorpusLevelMetricGrouping(
-            metric_name=TinyCorpusAggregator.METRICS,
-            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
-            sample_level_fn=TinyCorpusAggregator(name),
-            category=category,
-            corpus_level_fn=TinyCorpusAggregator(name),
-        ),
-    )
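
The registration pattern changed here: instead of mutating the global Metrics enum with aenum.extend_enum and referring to each metric by string name, the metric objects are now built up front in a plain dict and passed directly to each LightevalTaskConfig. A simplified, self-contained sketch of that pattern, using a hypothetical stand-in for CorpusLevelMetricGrouping and hypothetical task names:

    def make_metric(name: str) -> dict:
        # hypothetical stand-in for CorpusLevelMetricGrouping(...)
        return {"metric_name": f"tinybench_metric_{name}"}

    # Build every metric object once, keyed by name ...
    metrics = {f"tinybench_metric_{n}": make_metric(n) for n in ("gsm8k", "winogrande")}

    # ... then hand the object itself to the task config, rather than a string
    # that a global enum registry must resolve later.
    task_metrics = metrics["tinybench_metric_gsm8k"]
    print(task_metrics)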
