Skip to content

Commit 28728af

Browse files
committed
fix task_list; include_path takes priority
1 parent 140636e commit 28728af

File tree

13 files changed

+427
-211
lines changed

13 files changed

+427
-211
lines changed

lm_eval/tasks/factory.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,16 @@ def _load_full_config(
120120
"metadata": {"config": "unknown"}
121121
} # python task without YAML
122122

123+
# Handle task_list configs - merge base config with per-task overrides
124+
if "task_list" in cfg:
125+
task_list = cfg.pop("task_list")
126+
# Find the entry for this task in task_list
127+
for item in task_list:
128+
if isinstance(item, dict) and item.get("task") == entry.name:
129+
# Merge per-task overrides
130+
cfg = {**cfg, **item}
131+
break
132+
123133
if overrides:
124134
cfg = {**cfg, **overrides}
125135
cfg["metadata"] = (

lm_eval/tasks/manager.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def __init__(
3333
self._factory = TaskFactory(meta=metadata)
3434

3535
all_paths: list[Path] = []
36-
if include_defaults:
37-
all_paths.append(Path(__file__).parent)
36+
# Process include_path FIRST so user tasks take precedence over defaults
3837
if include_path:
3938
all_paths += [
4039
Path(p)
@@ -44,6 +43,8 @@ def __init__(
4443
else [include_path]
4544
)
4645
]
46+
if include_defaults:
47+
all_paths.append(Path(__file__).parent)
4748

4849
self._index = index.build(all_paths)
4950

tests/test_configs/group.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Group configuration demonstrating task collections
2+
3+
group: test_group
4+
task:
5+
- task: group_task_fs0
6+
dataset_path: json
7+
dataset_kwargs:
8+
data_files:
9+
test: tests/test_configs/test_data.json
10+
output_type: multiple_choice
11+
doc_to_text: "{{question}}"
12+
doc_to_target: "{{choices[answer]}}"
13+
test_split: test
14+
num_fewshot: 0
15+
metric_list:
16+
- metric: acc
17+
aggregation: mean
18+
higher_is_better: true
19+
- task: group_task_fs2
20+
dataset_path: json
21+
dataset_kwargs:
22+
data_files:
23+
test: tests/test_configs/test_data.json
24+
output_type: multiple_choice
25+
doc_to_text: "{{question}}"
26+
doc_to_target: "{{choices[answer]}}"
27+
test_split: test
28+
num_fewshot: 2
29+
metric_list:
30+
- metric: acc
31+
aggregation: mean
32+
higher_is_better: true
tests/test_configs/include_base.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
# Base configuration for include walkthrough tests
2+
# This will be included by other configs to demonstrate inheritance
3+
4+
task: base_task # This should be overridden by including configs
5+
dataset_path: json
6+
dataset_kwargs:
7+
data_files:
8+
test: tests/test_configs/test_data.json
9+
output_type: multiple_choice
10+
doc_to_text: "{{question}}"
11+
doc_to_target: "{{choices[answer]}}"
12+
test_split: test
13+
num_fewshot: 0 # Default, can be overridden
14+
metric_list:
15+
- metric: acc
16+
aggregation: mean
17+
higher_is_better: true
18+
metadata:
19+
version: 1.0
20+
description: "Base config for include demonstration"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Group with multiple tasks using include inheritance
2+
# Demonstrates tasks sharing the same base config
3+
4+
group: include_group
5+
task:
6+
- include_task_fs0
7+
- include_task_fs1
8+
- include_task_fs5
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Task demonstrating include inheritance
2+
3+
task: include_task_fs0
4+
include: include_base.yaml
5+
num_fewshot: 0
6+
description: "Zero-shot with inheritance"
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Task demonstrating include inheritance
2+
3+
task: include_task_fs1
4+
include: include_base.yaml
5+
num_fewshot: 1
6+
description: "One-shot with inheritance"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Task demonstrating include inheritance with custom metrics
2+
3+
task: include_task_fs5
4+
include: include_base.yaml
5+
num_fewshot: 5
6+
description: "Five-shot with custom metrics"
7+
metric_list:
8+
- metric: acc
9+
aggregation: mean
10+
higher_is_better: true
11+
- metric: acc_norm
12+
aggregation: mean
13+
higher_is_better: true
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Simple task configuration for walkthrough tests
2+
# Demonstrates basic task loading without any special features
3+
4+
task: simple_task
5+
dataset_path: json
6+
dataset_kwargs:
7+
data_files:
8+
test: tests/test_configs/test_data.json
9+
output_type: multiple_choice
10+
doc_to_text: "{{question}}"
11+
doc_to_target: "{{choices[answer]}}"
12+
test_split: test
13+
num_fewshot: 1
14+
metric_list:
15+
- metric: acc
16+
aggregation: mean
17+
higher_is_better: true
18+
metadata:
19+
version: 1.0
20+
description: "Simple task for basic walkthrough"

tests/test_configs/task_list.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Task list configuration for code walkthrough tests
2+
# This demonstrates the task_list feature with shared config and task-specific overrides
3+
4+
dataset_path: json
5+
dataset_kwargs:
6+
data_files:
7+
test: tests/test_configs/test_data.json
8+
output_type: multiple_choice
9+
doc_to_text: "{{question}}"
10+
doc_to_target: "{{choices[answer]}}"
11+
test_split: test
12+
metric_list:
13+
- metric: acc
14+
aggregation: mean
15+
higher_is_better: true
16+
metadata:
17+
version: 1.0
18+
description: "Task list walkthrough example"
19+
20+
task_list:
21+
- task: task_list_fs0
22+
num_fewshot: 0
23+
description: "Zero-shot variant"
24+
- task: task_list_fs1
25+
num_fewshot: 1
26+
description: "One-shot variant"
27+
- task: task_list_fs3
28+
num_fewshot: 3
29+
description: "Three-shot variant"
30+
metric_list:
31+
- metric: acc
32+
aggregation: mean
33+
higher_is_better: true
34+
- metric: acc_norm
35+
aggregation: mean
36+
higher_is_better: true

0 commit comments

Comments
 (0)