
Commit cdb4253

Authored by younesbelkada, beacnascimento, CaioRhoden, DanielGardin, and GiovaniValdrighi
Feat: Add E2LMC team morai submission from NeurIPS 2026 competition (#3443)
* Add Morai submission
* add README
* move to `e2lmc/`

Co-authored-by: Beatriz <[email protected]>
Co-authored-by: Caio Emanuel Rhoden <[email protected]>
Co-authored-by: Daniel Gardin <[email protected]>
Co-authored-by: Giovani Valdrighi <[email protected]>
1 parent bc7a58b commit cdb4253

File tree

5 files changed: +68 −0 lines changed

lm_eval/api/metrics.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -265,6 +265,16 @@ def perplexity_fn(items):  # This is a passthrough function
     return items


+@register_metric(
+    metric="likelihood",
+    higher_is_better=True,
+    output_type="multiple_choice",
+    aggregation="mean",
+)
+def likelihood_fn(items):  # This is a passthrough function
+    return items
+
+
 @register_metric(
     metric="word_perplexity",
     higher_is_better=False,
```

lm_eval/api/task.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -1691,6 +1691,7 @@ def process_results(self, doc, results):
                 if "brier_score" in use_metric
                 else {}
             ),
+            **({"likelihood": (gold, lls)} if "likelihood" in use_metric else {}),
         }

         if "acc_mutual_info" in use_metric:
```
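The one-line change above relies on Python's conditional dict-unpacking idiom: `**({...} if cond else {})` merges an entry into the surrounding dict literal only when the condition holds, and merges nothing otherwise. A minimal standalone illustration, with made-up stand-in values for `gold` and `lls`:

```python
# Toy illustration of the **({...} if cond else {}) idiom used in the diff.
# `use_metric`, `gold`, and `lls` are made-up stand-ins for the task.py values.
use_metric = {"acc", "likelihood"}
gold, lls = 1, [-2.0, -0.5, -3.0]

result = {
    "acc": 1.0,
    # This entry appears only because "likelihood" is in use_metric;
    # with use_metric = {"acc"} the dict would contain "acc" alone.
    **({"likelihood": (gold, lls)} if "likelihood" in use_metric else {}),
}
print(result)
```

The idiom avoids a separate `if` statement mutating the dict after construction, which is why a single-line diff suffices here.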
Lines changed: 23 additions & 0 deletions (new file)

```markdown
# MMLU Early Training

This is an update of the MMLU benchmark (Hendrycks et al.) to evaluate models at early training stages. MMLU consists of multiple-choice questions from various branches of knowledge, such as the humanities, social sciences, and hard sciences. The dataset was created by selecting the 50% "easiest" questions, that is, questions for which there is a learning signal at the early training stages. Furthermore, although MMLU is a multiple-choice dataset, in this task the choices are not present in the prompt, and the target is the complete choice text, not only the choice letter. To further ensure signal at early training stages, the metric is defined as the difference between the log-likelihood of the correct choice and the average log-likelihood of the incorrect choices.

### Groups, Tags, and Tasks

#### Groups

- Not part of a group yet.

#### Tasks

- `mmlu_early_training`

### Checklist

For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
  * [ ] Have you referenced the original paper that introduced the task?
  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?

If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
```
Lines changed: 14 additions & 0 deletions (new file)

```python
import numpy as np


def loglikelihood_diff(items):
    """Mean, over documents, of the difference between the correct choice's
    log-likelihood and the average log-likelihood of the incorrect choices."""
    diffs = []
    for item in items:
        target, lls = item
        target_ll = lls[target]
        others_ll = [ll for i, ll in enumerate(lls) if i != target]
        mean_others_ll = np.mean(others_ll)
        diff = target_ll - mean_others_ll
        diffs.append(diff)

    return np.mean(diffs)
```
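As a quick sanity check, the aggregation above can be applied to two made-up documents, each a `(target_index, per_choice_log_likelihoods)` pair like the ones `process_results` emits under the `"likelihood"` key (the numbers here are invented for illustration):

```python
import numpy as np


def loglikelihood_diff(items):
    # Same aggregation as custom_metrics.py above.
    diffs = []
    for target, lls in items:
        target_ll = lls[target]
        others_ll = [ll for i, ll in enumerate(lls) if i != target]
        diffs.append(target_ll - np.mean(others_ll))
    return np.mean(diffs)


# Two toy documents (invented log-likelihoods):
# doc 1: correct choice at index 0, diff = -1.0 - mean(-3.0, -5.0) = 3.0
# doc 2: correct choice at index 2, diff = -1.0 - mean(-4.0, -2.0) = 2.0
items = [
    (0, [-1.0, -3.0, -5.0]),
    (2, [-4.0, -2.0, -1.0]),
]
print(loglikelihood_diff(items))  # -> 2.5
```

Larger values mean the model separates the correct continuation from the distractors more strongly, which is the early-training signal the README describes.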
Lines changed: 20 additions & 0 deletions (new file)

```yaml
task: mmlu_early_training
dataset_path: giovanivaldrighi/mmlu
test_split: test
fewshot_split: dev
num_fewshot: 5
fewshot_config:
  sampler: first_n
output_type: multiple_choice
description: "The following are multiple choice questions (with answers) about {{subject}}.\n\n"
doc_to_text: "Question: {{question.strip()}}\nAnswer:"
doc_to_choice: "{{choices}}"
doc_to_target: "{{answer}}"
metric_list:
  - metric: likelihood
    aggregation: !function custom_metrics.loglikelihood_diff
    higher_is_better: true
metadata:
  version: 1.0
dataset_kwargs:
  trust_remote_code: true
```
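The `!function custom_metrics.loglikelihood_diff` entry is a YAML tag that the harness resolves to a Python callable. As an illustration only (this is not lm-eval's actual loader), such a tag can be resolved with a custom PyYAML constructor; `statistics.mean` stands in here for the real aggregation function:

```python
# Hypothetical sketch of resolving a "!function module.attr" YAML tag
# with PyYAML; lm-eval's real loader may differ.
import importlib

import yaml


def function_constructor(loader, node):
    # Resolve a "module.attribute" scalar to the imported callable.
    module_name, attr = loader.construct_scalar(node).rsplit(".", 1)
    return getattr(importlib.import_module(module_name), attr)


yaml.SafeLoader.add_constructor("!function", function_constructor)

config = yaml.safe_load("aggregation: !function statistics.mean")
print(config["aggregation"]([1, 2, 3]))  # -> 2
```

Registering the constructor on `SafeLoader` keeps the rest of the document restricted to plain YAML types while allowing this one tag through.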

0 commit comments