
Commit f2ec652

Merge branch 'main' into main

2 parents 3f26e42 + 06aee5b

13 files changed: +973, -7 lines

docs/source/_toctree.yml

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,8 @@
       title: Use the Python API
     - local: adding-a-custom-task
       title: Add a custom task
+    - local: offline-evaluation
+      title: Offline evaluation
     - local: adding-a-new-metric
       title: Add a custom metric
     - local: evaluating-a-custom-model

docs/source/index.mdx

Lines changed: 3 additions & 0 deletions

@@ -5,6 +5,9 @@
 performance by saving and exploring detailed, sample-by-sample results to debug
 and see how your models stack up.

+> [!TIP]
+> Share your evaluation results with the community by pushing them to the Hugging Face Hub. If you open Pull Requests on model repositories with evaluation results, we will automatically show the results on benchmark dataset repositories. Let's decentralize evaluation! Check out the [docs](https://huggingface.co/docs/hub/eval-results).
+
 ## Key Features

 ### 🚀 **Multi-Backend Support**

docs/source/offline-evaluation.md

Lines changed: 46 additions & 0 deletions (new file)

# Offline evaluation using local data files

If you are prototyping a task based on files that are not yet hosted on the
Hub, you can use the `hf_data_files` argument to point lighteval at local
JSON/CSV resources. This makes it easy to evaluate datasets that live in your
repo or that are generated on the fly.

Internally, `hf_data_files` is passed directly to the `data_files` parameter of `datasets.load_dataset` ([docs](https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset)).

See [adding a custom task](adding-a-custom-task) for more information on how to create a custom task.

```python
from pathlib import Path

from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def local_prompt(line: dict, task_name: str) -> Doc:
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=line["choices"],
        gold_index=line["answer"],
    )


local_data = Path(__file__).parent / "samples" / "faq.jsonl"

local_task = LightevalTaskConfig(
    name="faq_eval",
    prompt_function=local_prompt,
    hf_repo="json",  # Built-in streaming loader for json/jsonl files
    hf_subset="default",
    hf_data_files=str(local_data),  # Can also be a dict mapping split names to paths
    evaluation_splits=["train"],
    metrics=[Metrics.ACCURACY],
)
```

Once the config is registered in `TASKS_TABLE`, running the task with
`--custom-tasks path/to/your_file.py` will automatically load the local data
files. You can also pass a dictionary to `hf_data_files` (e.g.
`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
splits, as sketched below.
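For illustration, here is a minimal sketch of that dictionary form, reusing `local_prompt` and the imports from the snippet above. The file names and the `faq_eval_splits` task name are placeholders, not something this commit ships.

```python
# Minimal sketch: multiple local splits via a dict, mirroring the
# `data_files` argument of `datasets.load_dataset`. File names are placeholders.
multi_split_task = LightevalTaskConfig(
    name="faq_eval_splits",        # placeholder task name
    prompt_function=local_prompt,  # same prompt function as above
    hf_repo="json",
    hf_subset="default",
    hf_data_files={
        "train": "train.jsonl",
        "validation": "val.jsonl",
    },
    evaluation_splits=["validation"],
    metrics=[Metrics.ACCURACY],
)

# TASKS_TABLE = [local_task, multi_split_task]  # register so --custom-tasks picks them up
```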
Lines changed: 105 additions & 0 deletions (new file)

# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import logging
import tempfile
from functools import partial
from pathlib import Path

from custom_yourbench_task_mcq import yourbench_prompt
from datasets import Dataset, DatasetDict

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig


logger = logging.getLogger(__name__)

save_dir = str(tempfile.mkdtemp())

ds = DatasetDict(
    {
        "train": Dataset.from_dict(
            {
                "question": ["What is 2+2?", "Capital of France?"],
                "choices": [["1", "2", "3", "4"], ["Paris", "Berlin", "Rome", "Madrid"]],
                "gold": [[3], [0]],
            }
        )
    }
)


CustomTaskConfig = partial(
    LightevalTaskConfig,
    prompt_function=yourbench_prompt,
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=16,
    metrics=[Metrics.gpqa_instruct_metric],
    version=0,
)

# Example 1: save to disk (huggingface format) ####

ds.save_to_disk(save_dir)

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="arrow",
    hf_subset="default",
    hf_data_files=f"{save_dir}/**/*.arrow",
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM ARROW<<")
for doc in eval_docs:
    print(doc)


# Example 2: jsonlines format ####

jsonl_path = Path(save_dir) / "train.jsonl"
with open(jsonl_path, "w") as f:
    for row in ds["train"]:
        f.write(json.dumps(row) + "\n")

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="json",
    hf_subset="default",
    hf_data_files=str(jsonl_path),
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM JSONLINES<<")
for doc in eval_docs:
    print(doc)

# TASKS_TABLE = [yourbench_mcq]

src/lighteval/metrics/metrics_sample.py

Lines changed: 4 additions & 5 deletions

@@ -951,29 +951,28 @@ def __init__(
         short_judge_name: str | None = None,
         response_format: BaseModel | None = None,
         url: str | None = None,
+        api_key: str | None = None,
         hf_provider: str | None = None,
         max_tokens: int | None = None,
         backend_options: dict | None = None,
     ) -> None:
         logger.debug(f"Initializing JudgeLLM with backend: {judge_backend}, model: {judge_model_name}")

-        api_key = None
-
         match judge_backend:
             case "openai":
                 if judge_model_name not in self.available_models_openai:
                     raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
-                api_key = os.getenv("OPENAI_API_KEY")
+                api_key = api_key or os.getenv("OPENAI_API_KEY")
                 logger.debug("Using OpenAI backend for llm as a judge metric")

             case "tgi":
-                api_key = os.getenv("HF_TOKEN")
+                api_key = api_key or os.getenv("HF_TOKEN")
                 if url is None:
                     url = "https://api-inference.huggingface.co/v1/"
                 logger.debug("Using TGI backend")

             case "inference-providers":
-                api_key = os.getenv("HF_TOKEN")
+                api_key = api_key or os.getenv("HF_TOKEN")
                 logger.debug("Using Hugging Face Inference backend")

             case "litellm":

src/lighteval/metrics/utils/llm_as_judge.py

Lines changed: 5 additions & 0 deletions

@@ -326,9 +326,14 @@ def __call_api(prompt):
                 "messages": prompt,
                 "n": 1,
                 "caching": True,
+                "response_format": self.response_format,
             }
             if max_new_tokens is not None:
                 kwargs["max_tokens"] = (max_new_tokens,)
+            if self.api_key is not None:
+                kwargs["api_key"] = self.api_key
+            if self.url is not None:
+                kwargs["base_url"] = self.url

             response = litellm.completion(**kwargs)
             text = response.choices[0].message.content
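For context, a rough sketch of the resulting litellm call once an explicit key and base URL are configured. The model name, key, and URL are placeholders; only the `api_key`/`base_url` pass-through is what this diff adds.

```python
import litellm  # assumes litellm is installed

# Sketch of the call the judge ends up making when self.api_key and self.url are set.
response = litellm.completion(
    model="openai/gpt-4o-mini",                      # placeholder judge model
    messages=[{"role": "user", "content": "Rate this answer from 1 to 5."}],
    n=1,
    caching=True,
    max_tokens=256,
    api_key="sk-placeholder",                        # forwarded from self.api_key
    base_url="https://my-litellm-proxy.example/v1",  # forwarded from self.url
)
print(response.choices[0].message.content)
```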

src/lighteval/tasks/lighteval_task.py

Lines changed: 6 additions & 1 deletion

@@ -24,7 +24,7 @@
 import logging
 import random
 from dataclasses import asdict, dataclass, field
-from typing import Callable
+from typing import Callable, Mapping, Sequence

 from datasets import DatasetDict, load_dataset
 from huggingface_hub import TextGenerationInputGrammarType

@@ -59,6 +59,8 @@ class LightevalTaskConfig:
             row to Doc objects for evaluation. Takes a dataset row dict and task
             name as input.
         hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset.
+        hf_data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None):
+            Data files to load. Same as `data_files` argument of `datasets.load_dataset`.
         hf_subset (str): Dataset subset/configuration name to use for this task.
         metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task.

@@ -113,6 +115,7 @@ class LightevalTaskConfig:
     hf_repo: str
     hf_subset: str
     metrics: ListLike[Metric | Metrics]  # Accept both Metric objects and Metrics enums
+    hf_data_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None

     # Inspect AI compatible parameters
     solver: None = None

@@ -219,6 +222,7 @@ def __init__(

         # Dataset info
         self.dataset_path = config.hf_repo
+        self.data_files = config.hf_data_files
         self.dataset_config_name = config.hf_subset
         self.dataset_revision = config.hf_revision
         self.dataset_filter = config.hf_filter

@@ -454,6 +458,7 @@ def download_dataset_worker(
             path=task.dataset_path,
             name=task.dataset_config_name,
             revision=task.dataset_revision,
+            data_files=task.data_files,
         )

         if task.dataset_filter is not None:
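Because `hf_data_files` is forwarded verbatim as `data_files`, it accepts the same shapes `datasets.load_dataset` does. A quick standalone illustration (file names are placeholders):

```python
from datasets import load_dataset

# The shapes hf_data_files can take, since it is passed through as `data_files`:
single = load_dataset("json", data_files="train.jsonl")                  # one file
several = load_dataset("json", data_files=["a.jsonl", "b.jsonl"])        # list of files
per_split = load_dataset("json", data_files={"train": "train.jsonl",
                                             "validation": "val.jsonl"})  # split mapping
print(per_split)
```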

src/lighteval/tasks/prompt_manager.py

Lines changed: 1 addition & 1 deletion

@@ -206,7 +206,7 @@ def __init__(self, task: "LightevalTask"):

         if few_shots_select not in ALLOWED_SELECTIONS:
             raise ValueError(
-                f"few_shots_select must be one of f{','.join(ALLOWED_SELECTIONS[:-1])} or {ALLOWED_SELECTIONS[-1]}, not {few_shots_select}"
+                f"few_shots_select must be one of {','.join(ALLOWED_SELECTIONS[:-1])} or {ALLOWED_SELECTIONS[-1]}, not {few_shots_select}"
             )

         self.few_shots_select = FewShotSelection[few_shots_select]
