
Commit a6fd8e6

younesbelkada, ezzakri-anas, Dahbani1 authored
Feat: Add team Noor submission from NeurIPS E2LM Competition (#3444)
* Add Noor submission
* add readme
* move to `e2lmc`
* remove `.yaml` extension from file
* fix CI

Co-authored-by: Ezzakri Anas <[email protected]>
Co-authored-by: Dahbani1 <[email protected]>
1 parent 35a6e86 commit a6fd8e6

17 files changed: +327 −0 lines changed

lm_eval/tasks/e2lmc/noor/README.md

Lines changed: 41 additions & 0 deletions

# Noor – MMLU-var Filtered Benchmark

**Authors:** Mohammed Dahbani & Anas Ezzakri (IMT Atlantique)

## Description

This benchmark is a **filtered and improved version of MMLU/MMLU-var**, designed to provide a **stable, monotonic, and informative evaluation signal during the early stages of LLM training**.

Standard MMLU often becomes noisy or non-discriminative for models that have seen only limited training. Our benchmark keeps only the questions that reliably reflect **true learning progress**.

## Methodology

### 1. Scientific Compliance Filtering

- Automatically retain all *hard-science* subjects (Math, Physics, Chemistry, etc.).
- For more variable subjects (Biology, Medicine, Humanities), apply an **LLM-as-a-Judge** process:
  - Each question is evaluated **5 times**.
  - A question is retained only if it receives **5/5 “Accept”** decisions.

This ensures clarity, consistency, and the removal of ambiguous items.
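The unanimous-vote rule above can be sketched as follows. This is a minimal illustration, not the team's actual pipeline; `judge_fn` is a hypothetical stand-in for whatever LLM judge call was used:

```python
def is_retained(question: str, judge_fn, n_votes: int = 5) -> bool:
    """Keep a question only if the judge accepts it on every one of n_votes runs.

    judge_fn(question) -> "Accept" or "Reject"  (hypothetical judge interface)
    """
    return all(judge_fn(question) == "Accept" for _ in range(n_votes))


# With a deterministic stand-in judge:
always_accept = lambda q: "Accept"
print(is_retained("Which group axiom fails here?", always_accept))  # True
```

Requiring all five votes (rather than a majority) is what makes the filter strict enough to drop ambiguous items.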
### 2. Signal Quality Filtering

For every question, we compute its **Confidence Margin** across all training checkpoints and fit a **linear regression**:

- Only questions with a **positive slope** are retained.
- This ensures that each item produces a **smooth and monotonic learning trend**.

The combination of both filters produces a benchmark that is cleaner, more stable, and much more sensitive to early-stage learning dynamics.
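A minimal sketch of the slope filter, assuming each question's confidence margins are stored as a sequence ordered by checkpoint; `numpy.polyfit` stands in here for whatever regression routine the authors used:

```python
import numpy as np


def has_positive_slope(margins) -> bool:
    """Fit margin ~ slope * checkpoint_index + intercept; retain if slope > 0."""
    x = np.arange(len(margins))
    slope, _intercept = np.polyfit(x, margins, deg=1)
    return bool(slope > 0)


# A question whose margin grows across checkpoints is kept:
print(has_positive_slope([0.10, 0.22, 0.35, 0.51]))  # True
# One whose margin decays is dropped:
print(has_positive_slope([0.50, 0.40, 0.22, 0.10]))  # False
```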
## Task Structure

The main group:

- `noor`

This includes all subjects that pass both filtering stages.

Each subject is also available as an independent task:

- Example: `noor_abstract_algebra`, `noor_college_physics`, `noor_machine_learning`, etc.

## Purpose

To provide a **reliable and low-noise evaluation signal** for early-stage LLMs, where traditional benchmarks usually fail to capture meaningful progress.
Lines changed: 159 additions & 0 deletions

```python
# noqa
"""
Take in a YAML, and output all "other" splits with this YAML
"""

import argparse
import logging
import os

import yaml
from tqdm import tqdm


eval_logger = logging.getLogger("lm-eval")


SUBJECTS = {
    "abstract_algebra": "stem",
    "anatomy": "stem",
    "astronomy": "stem",
    "business_ethics": "other",
    "clinical_knowledge": "other",
    "college_biology": "stem",
    "college_chemistry": "stem",
    "college_computer_science": "stem",
    "college_mathematics": "stem",
    "college_medicine": "other",
    "college_physics": "stem",
    "computer_security": "stem",
    "conceptual_physics": "stem",
    "econometrics": "social_sciences",
    "electrical_engineering": "stem",
    "elementary_mathematics": "stem",
    "formal_logic": "humanities",
    # "global_facts": "other",
    "high_school_biology": "stem",
    "high_school_chemistry": "stem",
    "high_school_computer_science": "stem",
    "high_school_european_history": "humanities",
    "high_school_geography": "social_sciences",
    "high_school_government_and_politics": "social_sciences",
    "high_school_macroeconomics": "social_sciences",
    "high_school_mathematics": "stem",
    "high_school_microeconomics": "social_sciences",
    "high_school_physics": "stem",
    "high_school_psychology": "social_sciences",
    "high_school_statistics": "stem",
    "high_school_us_history": "humanities",
    "high_school_world_history": "humanities",
    "human_aging": "other",
    "human_sexuality": "social_sciences",
    "international_law": "humanities",
    "jurisprudence": "humanities",
    "logical_fallacies": "humanities",
    "machine_learning": "stem",
    "management": "other",
    "marketing": "other",
    "medical_genetics": "other",
    "miscellaneous": "other",
    # "moral_disputes": "humanities",
    # "moral_scenarios": "humanities",
    "nutrition": "other",
    "philosophy": "humanities",
    "prehistory": "humanities",
    "professional_accounting": "other",
    # "professional_law": "humanities",
    "professional_medicine": "other",
    "professional_psychology": "social_sciences",
    "public_relations": "social_sciences",
    # "security_studies": "social_sciences",
    "sociology": "social_sciences",
    "us_foreign_policy": "social_sciences",
    "virology": "other",
    "world_religions": "humanities",
}


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", required=True)
    parser.add_argument("--save_prefix_path", default="noor")
    parser.add_argument("--cot_prompt_path", default=None)
    parser.add_argument("--task_prefix", default="")
    parser.add_argument("--group_prefix", default="")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path, encoding="utf-8") as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_file = json.load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
        if category not in ALL_CATEGORIES:
            ALL_CATEGORIES.append(category)

        if args.cot_prompt_path is not None:
            description = cot_file[subject]
        else:
            description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "tag": f"noor_{args.task_prefix}_{category}"
            if args.task_prefix != ""
            else f"noor_{category}",
            "task": f"noor_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"noor_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                allow_unicode=True,
                default_style='"',
            )

    if args.task_prefix != "":
        noor_subcategories = [
            f"noor_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
        ]
    else:
        noor_subcategories = [f"noor_{category}" for category in ALL_CATEGORIES]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w", encoding="utf-8") as yaml_file:
        yaml.dump(
            {
                "group": f"noor_{args.task_prefix}"
                if args.task_prefix != ""
                else "noor",
                "task": noor_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
```
Lines changed: 14 additions & 0 deletions

```yaml
group: noor
group_alias: noor
task:
  - group: stem
    task:
      - noor_stem
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2
```
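For intuition, `weight_by_size: True` means subtask accuracies are combined as a size-weighted mean rather than a plain average of per-task scores. A rough sketch (illustrative only; the harness's own aggregation code handles this internally):

```python
def aggregate_acc(task_results):
    """task_results: list of (acc, n_docs) pairs for each subtask.

    Returns the mean accuracy weighted by the number of documents per subtask,
    i.e. equivalent to pooling all documents before computing accuracy.
    """
    total_docs = sum(n for _, n in task_results)
    return sum(acc * n for acc, n in task_results) / total_docs


# A small subtask at 50% and a large one at 100%:
print(aggregate_acc([(0.5, 10), (1.0, 30)]))  # 0.875
```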
Lines changed: 9 additions & 0 deletions

```yaml
group: noor_stem
group_alias: stem
task:
  - noor_stem_tasks
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 2
```
Lines changed: 20 additions & 0 deletions

```yaml
# task: noor
dataset_path: mdahba/noor_final2
output_type: multiple_choice # or: generate_until, loglikelihood, loglikelihood_rolling
# training_split: train
validation_split: validation
test_split: test
fewshot_split: dev
num_fewshot: 5
fewshot_config:
  sampler: first_n
doc_to_text: "Question: {{question.strip()}}\nAnswer:"
doc_to_choice: "{{choices}}"
doc_to_target: "{{answer}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
```
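The `doc_to_text` field above is evaluated by lm-evaluation-harness as a Jinja template against each dataset row. A plain-Python equivalent of what it renders, with an illustrative sample document (the field names `question`, `choices`, `answer` come from the config above; the sample values are made up):

```python
def render_doc_to_text(doc: dict) -> str:
    # Mirrors the template: "Question: {{question.strip()}}\nAnswer:"
    return f"Question: {doc['question'].strip()}\nAnswer:"


sample = {"question": " What is 2 + 2? ", "choices": ["3", "4", "5", "6"], "answer": 1}
print(render_doc_to_text(sample))
# Question: What is 2 + 2?
# Answer:
```

Each choice from `doc_to_choice` is scored as a continuation of this prompt, and `doc_to_target` picks the index of the correct one.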
Lines changed: 7 additions & 0 deletions

```yaml
"dataset_name": "abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract\
  \ algebra.\n\n"
"include": "_noor_template"
"tag": "noor_stem_tasks"
"task": "noor_abstract_algebra"
"task_alias": "abstract algebra"
```
Lines changed: 7 additions & 0 deletions

```yaml
"dataset_name": "college_computer_science"
"description": "The following are multiple choice questions (with answers) about college\
  \ computer science.\n\n"
"include": "_noor_template"
"tag": "noor_stem_tasks"
"task": "noor_college_computer_science"
"task_alias": "college computer science"
```
Lines changed: 7 additions & 0 deletions

```yaml
"dataset_name": "college_mathematics"
"description": "The following are multiple choice questions (with answers) about college\
  \ mathematics.\n\n"
"include": "_noor_template"
"tag": "noor_stem_tasks"
"task": "noor_college_mathematics"
"task_alias": "college mathematics"
```
Lines changed: 7 additions & 0 deletions

```yaml
"dataset_name": "college_physics"
"description": "The following are multiple choice questions (with answers) about college\
  \ physics.\n\n"
"include": "_noor_template"
"tag": "noor_stem_tasks"
"task": "noor_college_physics"
"task_alias": "college physics"
```
Lines changed: 7 additions & 0 deletions

```yaml
"dataset_name": "conceptual_physics"
"description": "The following are multiple choice questions (with answers) about conceptual\
  \ physics.\n\n"
"include": "_noor_template"
"tag": "noor_stem_tasks"
"task": "noor_conceptual_physics"
"task_alias": "conceptual physics"
```
