Merge branch 'main' into clem_homogeneize_generation_params

clefourrier · web-flow · commit e2d512be2282 · 2024-12-26T12:35:30.000+01:00
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -18,7 +18,6 @@ jobs:
        uses: actions/checkout@v3
        with:
         lfs: 'true'
-        ref: ${{ github.event.pull_request.head.sha }} # we want to test against our branch not against a merge commit
      - name: Setup Python environment
        uses: actions/setup-python@v4
        with:
diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
         fetch-depth: 0
     - name: Secret Scanning
       uses: trufflesecurity/trufflehog@main
-
diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
@@ -86,7 +86,6 @@ def arabic_mmlu_pfn(line, task_name: str = None):
         choices=valid_keys_arabic,  # Return only valid choices (Arabic keys)
         gold_index=answer_index,  # Correct index in the valid Arabic keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_arabic[answer_index],  # Correct answer in Arabic form
     )
 
 
@@ -149,7 +148,6 @@ def arabic_mmlu_ht_pfn(line, task_name: str = None):
         choices=[str(i) for i in range(1, len(choices) + 1)],  # List of strings instead of ints
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=str(answer_index),  # Assuming it's sorted based on the number
     )
 
 
@@ -328,7 +326,6 @@ def aratrust_pfn(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:3],
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index],
     )
 
 
@@ -413,7 +410,8 @@ def arabic_exams_pfn(line, task_name: str = None):
 def alghafa_pfn(line, task_name: str = None):
     question = line["query"]
     answer_index = int(line["label"])
-    choices = [line[key] for key in ["sol1", "sol2", "sol3", "sol4"]]
+    allowed_keys = [f"sol{i}" for i in range(1, 6)]
+    choices = [line[key] for key in allowed_keys if key in line]
 
     instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
     query = f"{instruction}السؤال: {question}\n"
@@ -802,7 +800,6 @@ def madinah_qa_pfn(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,  # Correct index in the valid keys
         instruction=instruction,
-        target_for_fewshot_sorting=valid_keys_latin[answer_index],  # Correct answer in Latin form
     )
 
 
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/adding-a-new-metric.mdx
@@ -92,4 +92,3 @@ if __name__ == "__main__":
 
 You can then give your custom metric to lighteval by using `--custom-tasks
 path_to_your_file` when launching it.
-
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/contributing-to-multilingual-evaluations.mdx
@@ -8,7 +8,7 @@ We welcome translations in your language!
 
 To contribute, you'll need to
 1. Open the [translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py) file
-2. Edit the file to add or expand the literal for your language of interest. 
+2. Edit the file to add or expand the literal for your language of interest.
 
 ```python
     Language.ENGLISH: TranslationLiterals(
@@ -42,7 +42,7 @@ To contribute, you'll need to
 
 ## Contributing a new multilingual task
 
-You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use. 
+You should first read our guide on [adding a custom task](adding-a-custom-task), to better understand the different parameters we use.
 
 Then, you should take a look at the current [multilingual tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py) file, to understand how they are defined. For multilingual evaluations the `prompt_function` should be implemented by language-adapted template. The template will take care of correct formatting, correct and consistent usage of language adjusted prompt anchors (e.g Question/Answer) and punctuation.
 
@@ -58,7 +58,7 @@ your_tasks = [
     LightevalTaskConfig(
         # Name of your evaluation
         name=f"evalname_{language.value}_{formulation.name.lower()}",
-        # The evaluation is community contributed 
+        # The evaluation is community contributed
         suite=["community"],
         # This will automatically get the correct metrics for your chosen formulation
         metric=get_metrics_for_formulation(
@@ -72,7 +72,7 @@ your_tasks = [
         # In this function, you choose which template to follow and for which language and formulation
         prompt_function=get_template_prompt_function(
             language=language,
-            # then use the adapter to define the mapping between the 
+            # then use the adapter to define the mapping between the
             # keys of the template (left), and the keys of your dataset
             # (right)
             # To know which template keys are required and available,
@@ -83,9 +83,9 @@ your_tasks = [
             },
             formulation=formulation,
         ),
-        # You can also add specific filters to remove irrelevant samples 
+        # You can also add specific filters to remove irrelevant samples
         hf_filter=lambda line: line["label"] in <condition>,
-        # You then select your huggingface dataset as well as 
+        # You then select your huggingface dataset as well as
         # the splits available for evaluation
         hf_repo=<dataset>,
         hf_subset=<subset>,
diff --git a/docs/source/package_reference/logging.mdx b/docs/source/package_reference/logging.mdx
@@ -1,4 +1,7 @@
-# Loggers
+# Logging
+
+## EvaluationTracker
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker
 
 ## GeneralConfigLogger
 [[autodoc]] logging.info_loggers.GeneralConfigLogger
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
@@ -35,7 +35,7 @@ def main():
         env_config=EnvConfig(cache_dir="tmp/"),
         # Remove the 2 parameters below once your configuration is tested
         override_batch_size=1,
-        max_samples=10 
+        max_samples=10
     )
 
     model_config = VLLMModelConfig(
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
@@ -20,7 +20,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-import copy
 import json
 import logging
 import os
@@ -82,16 +81,35 @@ def default(self, o):
 
 
 class EvaluationTracker:
-    """
-    Keeps track of the overall evaluation process and relevant informations.
+    """Keeps track of the overall evaluation process and relevant information.
 
-    The [`EvaluationTracker`] contains specific loggers for experiments details
-    ([`DetailsLogger`]), metrics ([`MetricsLogger`]), task versions
-    ([`VersionsLogger`]) as well as for the general configurations of both the
-    specific task ([`TaskConfigLogger`]) and overall evaluation run
-    ([`GeneralConfigLogger`]).  It compiles the data from these loggers and
+    The [`~logging.evaluation_tracker.EvaluationTracker`] contains specific loggers for experiment details
+    ([`~logging.info_loggers.DetailsLogger`]), metrics ([`~logging.info_loggers.MetricsLogger`]), task versions
+    ([`~logging.info_loggers.VersionsLogger`]) as well as for the general configurations of both the
+    specific task ([`~logging.info_loggers.TaskConfigLogger`]) and overall evaluation run
+    ([`~logging.info_loggers.GeneralConfigLogger`]). It compiles the data from these loggers and
     writes it to files, which can be published to the Hugging Face hub if
     requested.
+
+    Args:
+        output_dir (`str`): Local folder path where you want results to be saved.
+        save_details (`bool`, defaults to True): If True, details are saved to the `output_dir`.
+        push_to_hub (`bool`, defaults to False): If True, details are pushed to the hub.
+            Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
+            if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
+        push_to_tensorboard (`bool`, defaults to False): If True, will create and push the results for a tensorboard folder on the hub.
+        hub_results_org (`str`, *optional*): The organisation to push the results to.
+            See more details about the datasets organisation in [`EvaluationTracker.save`].
+        tensorboard_metric_prefix (`str`, defaults to "eval"): Prefix for the metrics in the tensorboard logs.
+        public (`bool`, defaults to False): If True, results and details are pushed to public orgs.
+        nanotron_run_info ([`~nanotron.config.GeneralArgs`], *optional*): Reference to information about Nanotron model runs.
+
+    **Attributes**:
+        - **details_logger** ([`~logging.info_loggers.DetailsLogger`]) -- Logger for experiment details.
+        - **metrics_logger** ([`~logging.info_loggers.MetricsLogger`]) -- Logger for experiment metrics.
+        - **versions_logger** ([`~logging.info_loggers.VersionsLogger`]) -- Logger for task versions.
+        - **general_config_logger** ([`~logging.info_loggers.GeneralConfigLogger`]) -- Logger for general configuration.
+        - **task_config_logger** ([`~logging.info_loggers.TaskConfigLogger`]) -- Logger for task configuration.
     """
 
     def __init__(
@@ -105,23 +123,7 @@ def __init__(
         public: bool = False,
         nanotron_run_info: "GeneralArgs" = None,
     ) -> None:
-        """
-        Creates all the necessary loggers for evaluation tracking.
-
-        Args:
-            output_dir (str): Local folder path where you want results to be saved
-            save_details (bool): If True, details are saved to the output_dir
-            push_to_hub (bool): If True, details are pushed to the hub.
-                Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset,
-                if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset.
-            push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub
-            hub_results_org (str): The organisation to push the results to. See
-                more details about the datasets organisation in
-                [`EvaluationTracker.save`]
-            tensorboard_metric_prefix (str): Prefix for the metrics in the tensorboard logs
-            public (bool): If True, results and details are pushed in private orgs
-            nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs
-        """
+        """Creates all the necessary loggers for evaluation tracking."""
         self.details_logger = DetailsLogger()
         self.metrics_logger = MetricsLogger()
         self.versions_logger = VersionsLogger()
@@ -153,8 +155,7 @@ def save(self) -> None:
         date_id = datetime.now().isoformat().replace(":", "-")
 
         # We first prepare data to save
-        config_general = copy.deepcopy(self.general_config_logger)
-        config_general = asdict(config_general)
+        config_general = asdict(self.general_config_logger)
         # We remove the config from logging, which contains context/accelerator objects
         config_general.pop("config")
 
diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
@@ -31,7 +31,7 @@
 logger = logging.getLogger(__name__)
 
 TOKEN = os.getenv("HF_TOKEN")
-CACHE_DIR: str = os.getenv("HF_HOME", "/scratch")
+CACHE_DIR: str = os.getenv("HF_HOME")
 
 HELP_PANEL_NAME_1 = "Common Parameters"
 HELP_PANEL_NAME_2 = "Logging Parameters"
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
@@ -148,10 +148,10 @@ def task_registry(self):
         intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys()))
         if len(intersection) > 0:
             logger.warning(
-                f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the default ones on conflict."
+                f"Following tasks ({intersection}) exists both in the default and custom tasks. Will use the custom ones on conflict."
             )
 
-        # Defaults tasks should overwrite custom tasks
+        # Custom tasks overwrite default tasks
         return {**default_tasks_registry, **custom_tasks_registry}
 
     @property

Original file line number	Diff line number	Diff line change
`@@ -92,4 +92,3 @@ if __name__ == "__main__":`
`92`	`92`
`93`	`93`	You can then give your custom metric to lighteval by using `--custom-tasks
`94`	`94`	path_to_your_file` when launching it.
`95`		`-`
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ def main():`
`35`	`35`	`env_config=EnvConfig(cache_dir="tmp/"),`
`36`	`36`	`# Remove the 2 parameters below once your configuration is tested`
`37`	`37`	`override_batch_size=1,`
`38`		`- max_samples=10`
	`38`	`+ max_samples=10`
`39`	`39`	`)`
`40`	`40`
`41`	`41`	`model_config = VLLMModelConfig(`