
Commit 35babcb

Remove suites in task configs example and fix task with hf_filters (#1051)

* remove suites and make fewshot optional
* fix docs to remove suites and fewshots
* fix tests
* remove suite argument in task config
* fix attempt to cache a functools.partial function
* fix styling

1 parent: 6524c6a

File tree: 7 files changed (+14, −28 lines)


docs/source/contributing-to-multilingual-evaluations.mdx
Lines changed: 0 additions & 2 deletions

@@ -147,8 +147,6 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed
-        suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
             formulation,
docs/source/quicktour.mdx
Lines changed: 1 addition & 7 deletions

@@ -60,12 +60,6 @@ lighteval accelerate \
 
 ### Task Specification
 
-The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
-
-```txt
-{suite}|{task}|{num_few_shot}
-```
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
@@ -74,7 +68,7 @@ etc.
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI.
 For example
 ```txt
-{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
+{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
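With the `{suite}|` prefix gone, a concrete instance of the parametrized form above would look as follows; the task and parameter names here are made-up placeholders for illustration, not values from this diff:

```txt
# old (removed): lighteval|mytask@length_normalization=True|0
# new:
mytask@length_normalization=True|0
```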

docs/source/saving-and-reading-results.mdx
Lines changed: 0 additions & 3 deletions

@@ -247,9 +247,6 @@ The main results file contains several sections:
             "Question="
         ],
         "num_samples": null,
-        "suite": [
-            "lighteval"
-        ],
         "original_num_docs": 1319,
         "effective_num_docs": 1,
         "must_remove_duplicate_docs": null,
examples/nanotron/custom_evaluation_tasks.py
Lines changed: 0 additions & 8 deletions

@@ -300,7 +300,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split=None,
         few_shots_select=None,
-        suite=["custom"],
         generation_size=40,
         stop_sequence=None,
     ):
@@ -314,7 +313,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -401,7 +399,6 @@ def __init__(
         evaluation_splits=["test"],
         few_shots_split="dev",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -415,7 +412,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -512,7 +508,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="train",
         few_shots_select=None,
-        suite=None,
         generation_size=4,
         stop_sequence=None,
     ):
@@ -526,7 +521,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )
@@ -646,7 +640,6 @@ def __init__(
         evaluation_splits=["train"],
         few_shots_split="validation",
         few_shots_select=None,
-        suite=None,
         generation_size=-1,
         stop_sequence=None,
     ):
@@ -660,7 +653,6 @@ def __init__(
             evaluation_splits=evaluation_splits,
             few_shots_split=few_shots_split,
             few_shots_select=few_shots_select,
-            suite=suite,
             generation_size=generation_size,
             stop_sequence=(stop_sequence if stop_sequence is not None else ["\n"]),
         )

examples/nanotron/custom_task.py
Lines changed: 0 additions & 2 deletions

@@ -71,7 +71,6 @@ def mmlu_anatomy(line):
 TASKS_TABLE = [
     LightevalTaskConfig(
         name="mmlu:anatomy",
-        suite=["custom"],
         prompt_function=mmlu_anatomy,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
@@ -85,7 +84,6 @@ def mmlu_anatomy(line):
     ),
     LightevalTaskConfig(
         name="mmlu:anatomy_signs",
-        suite=["custom"],
         prompt_function=mmlu_anatomy_signs,
         hf_repo="lighteval/mmlu",
         hf_subset="anatomy",
src/lighteval/cli_args.py
Lines changed: 1 addition & 1 deletion

@@ -243,7 +243,7 @@ class Arg:
     type=Annotated[
         str,
         Argument(
-            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'suite|task|version|split'. Use 'lighteval tasks list' to see available tasks."
+            help="Comma-separated list of tasks to evaluate. Format: 'task1,task2' or 'task{|fewshot}'. Use 'lighteval tasks list' to see available tasks."
         ),
     ],
     default=None,  # Required argument, no default
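Per the updated help string, the `suite|task|version|split` form is gone: tasks are plain names with an optional `|fewshot` suffix, joined by commas. A hedged illustration (the task names are placeholders, not from this diff):

```txt
# one task, few-shot count omitted
mytask
# several tasks, explicit few-shot counts
mytask|5,othertask|0
```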

src/lighteval/tasks/lighteval_task.py
Lines changed: 12 additions & 5 deletions

@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import functools
 import logging
 import random
 from dataclasses import asdict, dataclass, field
@@ -155,7 +156,7 @@ def __post_init__(self):
         self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else ()
         self.full_name = f"{self.name}|{self.num_fewshots}"  # todo clefourrier: this is likely incorrect
 
-    def __str__(self, lite: bool = False):
+    def __str__(self, lite: bool = False):  # noqa: C901
         md_writer = MarkdownTableWriter()
         md_writer.headers = ["Key", "Value"]
 
@@ -170,17 +171,23 @@ def __str__(self, lite: bool = False):
             if k == "metrics":
                 for ix, metrics in enumerate(v):
                     for metric_k, metric_v in metrics.items():
-                        if isinstance(metric_v, Callable):
-                            repr_v = metric_v.__name__
+                        if isinstance(metric_v, functools.partial):
+                            func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
+                            repr_v = f"partial({func_name}, ...)"
+                        elif isinstance(metric_v, Callable):
+                            repr_v = getattr(metric_v, "__name__", repr(metric_v))
                         elif isinstance(metric_v, Metric.get_allowed_types_for_metrics()):
                             repr_v = str(metric_v)
                         else:
                             repr_v = repr(metric_v)
                         values.append([f"{k} {ix}: {metric_k}", repr_v])
 
             else:
-                if isinstance(v, Callable):
-                    values.append([k, v.__name__])
+                if isinstance(v, functools.partial):
+                    func_name = getattr(v.func, "__name__", str(v.func))
+                    values.append([k, f"partial({func_name}, ...)"])
+                elif isinstance(v, Callable):
+                    values.append([k, getattr(v, "__name__", repr(v))])
                 else:
                     values.append([k, repr(v)])
 
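Why the special case: a `functools.partial` object is callable but carries no `__name__`, so the old `metric_v.__name__` lookup raised `AttributeError` whenever a metric function was a `partial` (as happens for parametrized tasks). A minimal, self-contained sketch of the formatting rule the patch adds; `exact_match` here is a made-up stand-in, not a lighteval function:

```python
import functools


def exact_match(pred, gold, strip=False):
    # Stand-in metric function, for demonstration only.
    p, g = (pred.strip(), gold.strip()) if strip else (pred, gold)
    return p == g


for metric_v in (exact_match, functools.partial(exact_match, strip=True)):
    if isinstance(metric_v, functools.partial):
        # partial objects expose the wrapped callable as .func
        func_name = getattr(metric_v.func, "__name__", str(metric_v.func))
        repr_v = f"partial({func_name}, ...)"
    elif callable(metric_v):
        repr_v = getattr(metric_v, "__name__", repr(metric_v))
    else:
        repr_v = repr(metric_v)
    print(repr_v)

# Prints:
#   exact_match
#   partial(exact_match, ...)
```

Note the branch order: the `functools.partial` check must precede the generic `Callable` check, since a partial is itself callable and would otherwise hit the `__name__` lookup.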
