fix task_list; include_path takes priority

baberabb · baberabb · commit 28728afadf38 · 2025-12-09T16:09:30.000+05:00
diff --git a/lm_eval/tasks/factory.py b/lm_eval/tasks/factory.py
@@ -120,6 +120,16 @@ def _load_full_config(
                 "metadata": {"config": "unknown"}
             }  # python task without YAML
 
+        # Handle task_list configs - merge base config with per-task overrides
+        if "task_list" in cfg:
+            task_list = cfg.pop("task_list")
+            # Find the entry for this task in task_list
+            for item in task_list:
+                if isinstance(item, dict) and item.get("task") == entry.name:
+                    # Shallow merge: per-task keys replace base keys wholesale
+                    cfg = {**cfg, **item}
+                    break
+
         if overrides:
             cfg = {**cfg, **overrides}
         cfg["metadata"] = (
diff --git a/lm_eval/tasks/manager.py b/lm_eval/tasks/manager.py
@@ -33,8 +33,7 @@ def __init__(
         self._factory = TaskFactory(meta=metadata)
 
         all_paths: list[Path] = []
-        if include_defaults:
-            all_paths.append(Path(__file__).parent)
+        # Process include_path FIRST so user tasks take precedence over defaults
         if include_path:
             all_paths += [
                 Path(p)
@@ -44,6 +43,8 @@ def __init__(
                     else [include_path]
                 )
             ]
+        if include_defaults:
+            all_paths.append(Path(__file__).parent)
 
         self._index = index.build(all_paths)
 
diff --git a/tests/test_configs/group.yaml b/tests/test_configs/group.yaml
@@ -0,0 +1,32 @@
+# Group configuration with two inline task definitions (0-shot and 2-shot)
+
+group: test_group
+task:
+  - task: group_task_fs0
+    dataset_path: json
+    dataset_kwargs:
+      data_files:
+        test: tests/test_configs/test_data.json
+    output_type: multiple_choice
+    doc_to_text: "{{question}}"
+    doc_to_target: "{{choices[answer]}}"
+    test_split: test
+    num_fewshot: 0
+    metric_list:
+      - metric: acc
+        aggregation: mean
+        higher_is_better: true
+  - task: group_task_fs2
+    dataset_path: json
+    dataset_kwargs:
+      data_files:
+        test: tests/test_configs/test_data.json
+    output_type: multiple_choice
+    doc_to_text: "{{question}}"
+    doc_to_target: "{{choices[answer]}}"
+    test_split: test
+    num_fewshot: 2
+    metric_list:
+      - metric: acc
+        aggregation: mean
+        higher_is_better: true
diff --git a/tests/test_configs/include_base.yaml b/tests/test_configs/include_base.yaml
@@ -0,0 +1,20 @@
+# Base configuration for include walkthrough tests
+# This will be included by other configs to demonstrate inheritance
+
+task: base_task # This should be overridden by including configs
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test: tests/test_configs/test_data.json
+output_type: multiple_choice
+doc_to_text: "{{question}}"
+doc_to_target: "{{choices[answer]}}"
+test_split: test
+num_fewshot: 0 # Default, can be overridden
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+  description: "Base config for include demonstration"
diff --git a/tests/test_configs/include_group.yaml b/tests/test_configs/include_group.yaml
@@ -0,0 +1,8 @@
+# Group with multiple tasks using include inheritance
+# Demonstrates tasks sharing the same base config
+
+group: include_group
+task:
+  - include_task_fs0
+  - include_task_fs1
+  - include_task_fs5
diff --git a/tests/test_configs/include_task_fs0.yaml b/tests/test_configs/include_task_fs0.yaml
@@ -0,0 +1,6 @@
+# Task demonstrating include inheritance
+
+task: include_task_fs0
+include: include_base.yaml
+num_fewshot: 0
+description: "Zero-shot with inheritance"
diff --git a/tests/test_configs/include_task_fs1.yaml b/tests/test_configs/include_task_fs1.yaml
@@ -0,0 +1,6 @@
+# Task demonstrating include inheritance
+
+task: include_task_fs1
+include: include_base.yaml
+num_fewshot: 1
+description: "One-shot with inheritance"
diff --git a/tests/test_configs/include_task_fs5.yaml b/tests/test_configs/include_task_fs5.yaml
@@ -0,0 +1,13 @@
+# Task demonstrating include inheritance with custom metrics
+
+task: include_task_fs5
+include: include_base.yaml
+num_fewshot: 5
+description: "Five-shot with custom metrics"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
diff --git a/tests/test_configs/simple_task.yaml b/tests/test_configs/simple_task.yaml
@@ -0,0 +1,20 @@
+# Simple task configuration for walkthrough tests
+# Demonstrates basic task loading without any special features
+
+task: simple_task
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test: tests/test_configs/test_data.json
+output_type: multiple_choice
+doc_to_text: "{{question}}"
+doc_to_target: "{{choices[answer]}}"
+test_split: test
+num_fewshot: 1
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+  description: "Simple task for basic walkthrough"
diff --git a/tests/test_configs/task_list.yaml b/tests/test_configs/task_list.yaml
@@ -0,0 +1,36 @@
+# Task list configuration for code walkthrough tests
+# This demonstrates the task_list feature with shared config and task-specific overrides
+
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test: tests/test_configs/test_data.json
+output_type: multiple_choice
+doc_to_text: "{{question}}"
+doc_to_target: "{{choices[answer]}}"
+test_split: test
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
+  description: "Task list walkthrough example"
+
+task_list:
+  - task: task_list_fs0
+    num_fewshot: 0
+    description: "Zero-shot variant"
+  - task: task_list_fs1
+    num_fewshot: 1
+    description: "One-shot variant"
+  - task: task_list_fs3
+    num_fewshot: 3
+    description: "Three-shot variant"
+    metric_list:
+      - metric: acc
+        aggregation: mean
+        higher_is_better: true
+      - metric: acc_norm
+        aggregation: mean
+        higher_is_better: true
diff --git a/tests/test_configs/test_data.json b/tests/test_configs/test_data.json
@@ -0,0 +1,15 @@
+[
+  { "question": "What is 2+2?", "choices": ["1", "2", "3", "4"], "answer": 3 },
+  { "question": "What is 3+3?", "choices": ["4", "5", "6", "7"], "answer": 2 },
+  { "question": "What is 4+4?", "choices": ["6", "7", "8", "9"], "answer": 2 },
+  {
+    "question": "What is 5+5?",
+    "choices": ["8", "9", "10", "11"],
+    "answer": 2
+  },
+  {
+    "question": "What is 6+6?",
+    "choices": ["10", "11", "12", "13"],
+    "answer": 2
+  }
+]
diff --git a/tests/test_include_path.py b/tests/test_include_path.py
diff --git a/tests/test_task_manager.py b/tests/test_task_manager.py