
Commit f2ec652

Merge branch 'main' into main

2 parents 3f26e42 + 06aee5b

13 files changed: +973, -7 lines

docs/source/_toctree.yml

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,8 @@
       title: Use the Python API
     - local: adding-a-custom-task
       title: Add a custom task
+    - local: offline-evaluation
+      title: Offline evaluation
     - local: adding-a-new-metric
       title: Add a custom metric
     - local: evaluating-a-custom-model

docs/source/index.mdx

Lines changed: 3 additions & 0 deletions

@@ -5,6 +5,9 @@
 performance by saving and exploring detailed, sample-by-sample results to debug
 and see how your models stack up.

+> [!TIP]
+> Share your evaluation results with the community by pushing them to the Hugging Face Hub. If you open Pull Requests on model repositories with evaluation results, we will automatically show the results on benchmark dataset repositories. Let's decentralize evaluation! Check out the [docs](https://huggingface.co/docs/hub/eval-results).
+
 ## Key Features

 ### 🚀 **Multi-Backend Support**

docs/source/offline-evaluation.md

Lines changed: 46 additions & 0 deletions (new file)

# Offline evaluation using local data files

If you are prototyping a task based on files that are not yet hosted on the
Hub, you can use the `hf_data_files` argument to point lighteval at local
JSON/CSV resources. This makes it easy to evaluate datasets that live in your
repo or that are generated on the fly.

Internally, `hf_data_files` is passed directly to the `data_files` parameter of `datasets.load_dataset` ([docs](https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset)).

See [adding a custom task](adding-a-custom-task) for more information on how to create a custom task.

```python
from pathlib import Path

from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def local_prompt(line: dict, task_name: str) -> Doc:
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=line["choices"],
        gold_index=line["answer"],
    )


local_data = Path(__file__).parent / "samples" / "faq.jsonl"

local_task = LightevalTaskConfig(
    name="faq_eval",
    prompt_function=local_prompt,
    hf_repo="json",  # Built-in streaming loader for json/jsonl files
    hf_subset="default",
    hf_data_files=str(local_data),  # Can also be a dict mapping split names to paths
    evaluation_splits=["train"],
    metrics=[Metrics.ACCURACY],
)
```

Once the config is registered in `TASKS_TABLE`, running the task with
`--custom-tasks path/to/your_file.py` will automatically load the local data
files. You can also pass a dictionary to `hf_data_files` (e.g.
`{"train": "train.jsonl", "validation": "val.jsonl"}`) to expose multiple
splits, as sketched below.
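For illustration, here is a minimal sketch of that dictionary form, reusing `local_prompt` and the imports from the snippet above. The file names and the `faq_eval_splits` task name are placeholders, not something this commit ships.

```python
# Minimal sketch: multiple local splits via a dict, mirroring the
# `data_files` argument of `datasets.load_dataset`. File names are placeholders.
multi_split_task = LightevalTaskConfig(
    name="faq_eval_splits",        # placeholder task name
    prompt_function=local_prompt,  # same prompt function as above
    hf_repo="json",
    hf_subset="default",
    hf_data_files={
        "train": "train.jsonl",
        "validation": "val.jsonl",
    },
    evaluation_splits=["validation"],
    metrics=[Metrics.ACCURACY],
)

# TASKS_TABLE = [local_task, multi_split_task]  # register so --custom-tasks picks them up
```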
Lines changed: 105 additions & 0 deletions (new file)

# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import logging
import tempfile
from functools import partial
from pathlib import Path

from custom_yourbench_task_mcq import yourbench_prompt
from datasets import Dataset, DatasetDict

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig


logger = logging.getLogger(__name__)

save_dir = str(tempfile.mkdtemp())

ds = DatasetDict(
    {
        "train": Dataset.from_dict(
            {
                "question": ["What is 2+2?", "Capital of France?"],
                "choices": [["1", "2", "3", "4"], ["Paris", "Berlin", "Rome", "Madrid"]],
                "gold": [[3], [0]],
            }
        )
    }
)


CustomTaskConfig = partial(
    LightevalTaskConfig,
    prompt_function=yourbench_prompt,
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=16,
    metrics=[Metrics.gpqa_instruct_metric],
    version=0,
)

# Example 1: save to disk (huggingface format) ####

ds.save_to_disk(save_dir)

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="arrow",
    hf_subset="default",
    hf_data_files=f"{save_dir}/**/*.arrow",
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM ARROW<<")
for doc in eval_docs:
    print(doc)


# Example 2: jsonlines format ####

jsonl_path = Path(save_dir) / "train.jsonl"
with open(jsonl_path, "w") as f:
    for row in ds["train"]:
        f.write(json.dumps(row) + "\n")

yourbench_mcq = CustomTaskConfig(
    name="tiny_mcqa_dataset",
    hf_repo="json",
    hf_subset="default",
    hf_data_files=str(jsonl_path),
)

task = LightevalTask(yourbench_mcq)
eval_docs = task.eval_docs()

print("\n>>READING TASK FROM JSONLINES<<")
for doc in eval_docs:
    print(doc)

# TASKS_TABLE = [yourbench_mcq]

src/lighteval/metrics/metrics_sample.py

Lines changed: 4 additions & 5 deletions

@@ -951,29 +951,28 @@ def __init__(
         short_judge_name: str | None = None,
         response_format: BaseModel | None = None,
         url: str | None = None,
+        api_key: str | None = None,
         hf_provider: str | None = None,
         max_tokens: int | None = None,
         backend_options: dict | None = None,
     ) -> None:
         logger.debug(f"Initializing JudgeLLM with backend: {judge_backend}, model: {judge_model_name}")

-        api_key = None
-
         match judge_backend:
             case "openai":
                 if judge_model_name not in self.available_models_openai:
                     raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
-                api_key = os.getenv("OPENAI_API_KEY")
+                api_key = api_key or os.getenv("OPENAI_API_KEY")
                 logger.debug("Using OpenAI backend for llm as a judge metric")

             case "tgi":
-                api_key = os.getenv("HF_TOKEN")
+                api_key = api_key or os.getenv("HF_TOKEN")
                 if url is None:
                     url = "https://api-inference.huggingface.co/v1/"
                 logger.debug("Using TGI backend")

             case "inference-providers":
-                api_key = os.getenv("HF_TOKEN")
+                api_key = api_key or os.getenv("HF_TOKEN")
                 logger.debug("Using Hugging Face Inference backend")

             case "litellm":

src/lighteval/metrics/utils/llm_as_judge.py

Lines changed: 5 additions & 0 deletions

@@ -326,9 +326,14 @@ def __call_api(prompt):
                 "messages": prompt,
                 "n": 1,
                 "caching": True,
+                "response_format": self.response_format,
             }
             if max_new_tokens is not None:
                 kwargs["max_tokens"] = (max_new_tokens,)
+            if self.api_key is not None:
+                kwargs["api_key"] = self.api_key
+            if self.url is not None:
+                kwargs["base_url"] = self.url

             response = litellm.completion(**kwargs)
             text = response.choices[0].message.content
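For context, a rough sketch of the resulting litellm call once an explicit key and base URL are configured. The model name, key, and URL are placeholders; only the `api_key`/`base_url` pass-through is what this diff adds.

```python
import litellm  # assumes litellm is installed

# Sketch of the call the judge ends up making when self.api_key and self.url are set.
response = litellm.completion(
    model="openai/gpt-4o-mini",                      # placeholder judge model
    messages=[{"role": "user", "content": "Rate this answer from 1 to 5."}],
    n=1,
    caching=True,
    max_tokens=256,
    api_key="sk-placeholder",                        # forwarded from self.api_key
    base_url="https://my-litellm-proxy.example/v1",  # forwarded from self.url
)
print(response.choices[0].message.content)
```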

src/lighteval/tasks/lighteval_task.py

Lines changed: 6 additions & 1 deletion

@@ -24,7 +24,7 @@
 import logging
 import random
 from dataclasses import asdict, dataclass, field
-from typing import Callable
+from typing import Callable, Mapping, Sequence

 from datasets import DatasetDict, load_dataset
 from huggingface_hub import TextGenerationInputGrammarType

@@ -59,6 +59,8 @@ class LightevalTaskConfig:
             row to Doc objects for evaluation. Takes a dataset row dict and task
             name as input.
         hf_repo (str): HuggingFace Hub repository path containing the evaluation dataset.
+        hf_data_files (str | Sequence[str] | Mapping[str, str | Sequence[str]] | None):
+            Data files to load. Same as `data_files` argument of `datasets.load_dataset`.
         hf_subset (str): Dataset subset/configuration name to use for this task.
         metrics (ListLike[Metric | Metrics]): List of metrics or metric enums to compute for this task.

@@ -113,6 +115,7 @@ class LightevalTaskConfig:
     hf_repo: str
     hf_subset: str
     metrics: ListLike[Metric | Metrics]  # Accept both Metric objects and Metrics enums
+    hf_data_files: str | Sequence[str] | Mapping[str, str | Sequence[str]] | None = None

     # Inspect AI compatible parameters
     solver: None = None

@@ -219,6 +222,7 @@ def __init__(

         # Dataset info
         self.dataset_path = config.hf_repo
+        self.data_files = config.hf_data_files
         self.dataset_config_name = config.hf_subset
         self.dataset_revision = config.hf_revision
         self.dataset_filter = config.hf_filter

@@ -454,6 +458,7 @@ def download_dataset_worker(
             path=task.dataset_path,
             name=task.dataset_config_name,
             revision=task.dataset_revision,
+            data_files=task.data_files,
         )

         if task.dataset_filter is not None:
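Because `hf_data_files` is forwarded verbatim as `data_files`, it accepts the same shapes `datasets.load_dataset` does. A quick standalone illustration (file names are placeholders):

```python
from datasets import load_dataset

# The shapes hf_data_files can take, since it is passed through as `data_files`:
single = load_dataset("json", data_files="train.jsonl")                  # one file
several = load_dataset("json", data_files=["a.jsonl", "b.jsonl"])        # list of files
per_split = load_dataset("json", data_files={"train": "train.jsonl",
                                             "validation": "val.jsonl"})  # split mapping
print(per_split)
```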

src/lighteval/tasks/prompt_manager.py

Lines changed: 1 addition & 1 deletion

@@ -206,7 +206,7 @@ def __init__(self, task: "LightevalTask"):

         if few_shots_select not in ALLOWED_SELECTIONS:
             raise ValueError(
-                f"few_shots_select must be one of f{','.join(ALLOWED_SELECTIONS[:-1])} or {ALLOWED_SELECTIONS[-1]}, not {few_shots_select}"
+                f"few_shots_select must be one of {','.join(ALLOWED_SELECTIONS[:-1])} or {ALLOWED_SELECTIONS[-1]}, not {few_shots_select}"
             )

         self.few_shots_select = FewShotSelection[few_shots_select]
