
Commit 3785d85

Just adding the custom metrics system (#65)
1 parent 4907499 commit 3785d85

File tree: README.md · community_tasks/_template.py · pyproject.toml · src/lighteval/metrics/metrics.py

4 files changed: +45 −4 lines

README.md

Lines changed: 22 additions & 2 deletions
@@ -185,8 +185,28 @@ However, we are very grateful to the Harness and HELM teams for their continued
 
 ## Customisation
 ### Adding a new metric
-If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`. If not, add it to either of these files depending on the level at which it is applied.
-Then, follow the example in `src.lighteval.metrics.metrics` to register your metric.
+First check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `src.lighteval.metrics.metrics_sample`.
+
+If not, you can use the custom_task system to register your new metric:
+- create a new python file which should contain the full logic of your metric.
+- the file also needs to start with these imports
+```python
+from aenum import extend_enum
+from lighteval.metrics import Metrics
+
+# And any other class you might need to redefine your specific metric, depending on whether it's a sample or corpus metric.
+```
+
+- and to end with the following, so that it adds your metric to our metrics list when loaded as a module.
+
+```python
+# Adds the metric to the metric list!
+extend_enum(Metrics, "ifeval_metric", ifeval_metrics)
+if __name__ == "__main__":
+    print("Imported metric")
+```
+
+You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it.
 
 ### Adding a new task
 To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, or in the community tasks, and **add its dataset** on the hub.
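Read end to end, the new README instructions describe one self-contained Python file. A minimal sketch of such a file, assuming a hypothetical `word_count` scorer (the imports and `SampleLevelMetric` fields mirror `community_tasks/_template.py` below):

```python
# my_custom_metric.py -- hypothetical file assembled from the README steps above
import numpy as np
from aenum import extend_enum

from lighteval.metrics import Metrics
from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.metrics.utils import MetricCategory, MetricUseCase


def word_count(x):
    """Illustrative sample-level scorer: length of one model output in words."""
    return len(str(x).split())


word_count_metric = SampleLevelMetric(
    metric="word_count_metric",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=word_count,  # score for one sample
    corpus_level_fn=np.mean,  # aggregation over samples
)

# Register the metric when this file is loaded as a module.
extend_enum(Metrics, "word_count_metric", word_count_metric)

if __name__ == "__main__":
    print("Imported metric")
```

Such a file would then be passed to lighteval via `--custom-tasks my_custom_metric.py`, as the README text above describes.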

community_tasks/_template.py

Lines changed: 19 additions & 0 deletions
@@ -6,6 +6,12 @@
 
 Author:
 """
+import numpy as np
+from aenum import extend_enum
+
+from lighteval.metrics import Metrics
+from lighteval.metrics.metrics import SampleLevelMetric
+from lighteval.metrics.utils import MetricCategory, MetricUseCase
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
@@ -80,6 +86,19 @@ def prompt_fn(line, task_name: str = None):
 SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
 _TASKS = SUBSET_TASKS + [task]
 
+
+## CUSTOM METRIC IF NEEDED
+custom_metric = SampleLevelMetric(
+    metric="my_custom_metric_name",
+    higher_is_better=True,
+    category=MetricCategory.IGNORED,
+    use_case=MetricUseCase.NONE,
+    sample_level_fn=lambda x: x,  # how to compute score for one sample
+    corpus_level_fn=np.mean,  # aggregation
+)
+
+extend_enum(Metrics, "my_custom_metric_name", custom_metric)
+
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
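A note on the two callables in the template: `sample_level_fn` scores each sample individually and `corpus_level_fn` aggregates those scores. A rough sketch of how they compose (hypothetical driver code run against the template's `custom_metric`; lighteval performs this loop internally):

```python
import numpy as np

# Assume sample_level_fn returns a float per sample; the template's
# `lambda x: x` is only a placeholder, so feed it floats directly here.
per_sample_scores = [custom_metric.sample_level_fn(x) for x in (1.0, 0.0, 1.0)]
final_score = custom_metric.corpus_level_fn(per_sample_scores)  # np.mean -> ~0.667
```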

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -60,6 +60,9 @@ dependencies = [
     "termcolor==2.3.0",
     "pytablewriter",
     "colorama",
+
+    # Extension of metrics
+    "aenum==3.1.15",
     # Base metrics
     "nltk==3.8.1",
     "numpy",

src/lighteval/metrics/metrics.py

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,5 @@
-from enum import Enum
-
 import numpy as np
+from aenum import Enum
 
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
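This import swap is what the rest of the commit relies on: the stdlib `enum.Enum` offers no supported way to add members after the class is created, whereas `aenum` ships a drop-in `Enum` together with `extend_enum` for exactly that. A minimal sketch of the mechanism with a toy enum (not lighteval's real `Metrics`):

```python
from aenum import Enum, extend_enum


class Toy(Enum):  # toy stand-in for lighteval's Metrics enum
    exact_match = "exact_match"


# Add a member at runtime, as the custom-metric files do.
extend_enum(Toy, "my_custom_metric_name", "my_custom_metric_name")
print(Toy.my_custom_metric_name)  # Toy.my_custom_metric_name
print(list(Toy))  # now includes the new member
```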
