Skip to content

Commit cff8925

Browse files
committed
update task grouping
1 parent 6c23bb7 commit cff8925

File tree

2 files changed

+116
-42
lines changed

2 files changed

+116
-42
lines changed

configs/all_tasks.json

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,31 @@
11
{
22
"infer": {
33
"pretrain": [
4+
"arc", "arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh",
5+
"hellaswag", "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi",
46
"winogrande", "xcopa_et", "xcopa_ht", "xcopa_qu", "xcopa_id", "xcopa_it", "xcopa_sw", "xcopa_ta", "xcopa_th", "xcopa_tr", "xcopa_vi", "xcopa_zh", "xnli_ar", "xnli_bg", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_sw", "xnli_th", "xnli_tr", "xnli_ur", "xnli_vi", "xnli_zh", "xwinograd_en", "xwinograd_fr", "xwinograd_pt", "xwinograd_ru", "xwinograd_zh"
57
],
6-
"knowledge_dev": [
8+
"factual_dev": [
79
"mmlu", "global_mmlu_ar", "global_mmlu_bn", "global_mmlu_de", "global_mmlu_es", "global_mmlu_fr", "global_mmlu_hi", "global_mmlu_id", "global_mmlu_it", "global_mmlu_ja", "global_mmlu_ko", "global_mmlu_pt", "global_mmlu_sw", "global_mmlu_yo", "global_mmlu_zh",
810
"include_base_44_albanian", "include_base_44_arabic", "include_base_44_armenian", "include_base_44_azerbaijani", "include_base_44_basque", "include_base_44_belarusian", "include_base_44_bengali", "include_base_44_bulgarian", "include_base_44_chinese", "include_base_44_croatian", "include_base_44_finnish", "include_base_44_french", "include_base_44_georgian", "include_base_44_german", "include_base_44_hebrew", "include_base_44_hindi", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_kazakh", "include_base_44_korean", "include_base_44_lithuanian", "include_base_44_malay", "include_base_44_malayalam", "include_base_44_nepali", "include_base_44_persian", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_serbian", "include_base_44_spanish", "include_base_44_tagalog", "include_base_44_tamil", "include_base_44_telugu", "include_base_44_turkish", "include_base_44_ukrainian", "include_base_44_uzbek", "include_base_44_vietnamese",
9-
"blend_algeria", "blend_assam", "blend_azerbaijan", "blend_china", "blend_ethiopia", "blend_greece", "blend_indonesia", "blend_iran", "blend_mexico", "blend_north_korea", "blend_northern_nigeria", "blend_south_korea", "blend_spain", "blend_uk", "blend_us", "blend_west_java",
10-
"cultural_bench_easy_argentina", "cultural_bench_easy_australia", "cultural_bench_easy_bangladesh", "cultural_bench_easy_brazil", "cultural_bench_easy_canada", "cultural_bench_easy_chile", "cultural_bench_easy_china", "cultural_bench_easy_czech_republic", "cultural_bench_easy_egypt", "cultural_bench_easy_france", "cultural_bench_easy_germany", "cultural_bench_easy_hong_kong", "cultural_bench_easy_india", "cultural_bench_easy_indonesia", "cultural_bench_easy_iran", "cultural_bench_easy_israel", "cultural_bench_easy_italy", "cultural_bench_easy_japan", "cultural_bench_easy_lebanon", "cultural_bench_easy_malaysia", "cultural_bench_easy_mexico", "cultural_bench_easy_morocco", "cultural_bench_easy_nepal", "cultural_bench_easy_netherlands", "cultural_bench_easy_new_zealand", "cultural_bench_easy_nigeria", "cultural_bench_easy_pakistan", "cultural_bench_easy_peru", "cultural_bench_easy_philippines", "cultural_bench_easy_poland", "cultural_bench_easy_romania", "cultural_bench_easy_russia", "cultural_bench_easy_saudi_arabia", "cultural_bench_easy_singapore", "cultural_bench_easy_south_africa", "cultural_bench_easy_south_korea", "cultural_bench_easy_spain", "cultural_bench_easy_taiwan", "cultural_bench_easy_thailand", "cultural_bench_easy_turkey", "cultural_bench_easy_ukraine", "cultural_bench_easy_united_kingdom", "cultural_bench_easy_united_states", "cultural_bench_easy_vietnam", "cultural_bench_easy_zimbabwe",
11-
"cultural_bench_hard_argentina", "cultural_bench_hard_australia", "cultural_bench_hard_bangladesh", "cultural_bench_hard_brazil", "cultural_bench_hard_canada", "cultural_bench_hard_chile", "cultural_bench_hard_china", "cultural_bench_hard_czech_republic", "cultural_bench_hard_egypt", "cultural_bench_hard_france", "cultural_bench_hard_germany", "cultural_bench_hard_hong_kong", "cultural_bench_hard_india", "cultural_bench_hard_indonesia", "cultural_bench_hard_iran", "cultural_bench_hard_israel", "cultural_bench_hard_italy", "cultural_bench_hard_japan", "cultural_bench_hard_lebanon", "cultural_bench_hard_malaysia", "cultural_bench_hard_mexico", "cultural_bench_hard_morocco", "cultural_bench_hard_nepal", "cultural_bench_hard_netherlands", "cultural_bench_hard_new_zealand", "cultural_bench_hard_nigeria", "cultural_bench_hard_pakistan", "cultural_bench_hard_peru", "cultural_bench_hard_philippines", "cultural_bench_hard_poland", "cultural_bench_hard_romania", "cultural_bench_hard_russia", "cultural_bench_hard_saudi_arabia", "cultural_bench_hard_singapore", "cultural_bench_hard_south_africa", "cultural_bench_hard_south_korea", "cultural_bench_hard_spain", "cultural_bench_hard_taiwan", "cultural_bench_hard_thailand", "cultural_bench_hard_turkey", "cultural_bench_hard_ukraine", "cultural_bench_hard_united_kingdom", "cultural_bench_hard_united_states", "cultural_bench_hard_vietnam", "cultural_bench_hard_zimbabwe",
11+
"blend",
12+
"cultural_bench",
1213
"truthfulqa",
1314
"truthfulqa_ar", "truthfulqa_bn", "truthfulqa_ca", "truthfulqa_da", "truthfulqa_de", "truthfulqa_es", "truthfulqa_eu", "truthfulqa_fr", "truthfulqa_gu", "truthfulqa_hi", "truthfulqa_hr", "truthfulqa_hu", "truthfulqa_hy", "truthfulqa_id", "truthfulqa_it", "truthfulqa_kn", "truthfulqa_ml", "truthfulqa_mr", "truthfulqa_ne", "truthfulqa_nl", "truthfulqa_pt", "truthfulqa_ro", "truthfulqa_ru", "truthfulqa_sk", "truthfulqa_sr", "truthfulqa_sv", "truthfulqa_ta", "truthfulqa_te", "truthfulqa_uk", "truthfulqa_vi", "truthfulqa_zh"
1415
],
15-
"knowledge_test": [
16+
"factual_test": [
1617
"mmlu_pro", "agieval",
1718
"mmlu_prox_ar", "mmlu_prox_bn", "mmlu_prox_de", "mmlu_prox_en", "mmlu_prox_es", "mmlu_prox_fr", "mmlu_prox_hi", "mmlu_prox_ja", "mmlu_prox_ko", "mmlu_prox_pt", "mmlu_prox_sw", "mmlu_prox_th", "mmlu_prox_zh"
1819
],
1920
"sft_dev": [
20-
"ifeval",
21-
"hellaswag", "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi"
21+
"ifeval"
2222
],
2323
"alignment_dev": [
2424
"realtoxicityprompts",
2525
"polyglotoxicityprompts_full_arabic", "polyglotoxicityprompts_full_czech", "polyglotoxicityprompts_full_german", "polyglotoxicityprompts_full_english", "polyglotoxicityprompts_full_spanish", "polyglotoxicityprompts_full_french", "polyglotoxicityprompts_full_hindi", "polyglotoxicityprompts_full_indonesian", "polyglotoxicityprompts_full_italian", "polyglotoxicityprompts_full_japanese", "polyglotoxicityprompts_full_korean", "polyglotoxicityprompts_full_dutch", "polyglotoxicityprompts_full_polish", "polyglotoxicityprompts_full_portuguese", "polyglotoxicityprompts_full_russian", "polyglotoxicityprompts_full_swedish", "polyglotoxicityprompts_full_chinese",
2626
"multijail_en", "multijail_ar", "multijail_ko", "multijail_th", "multijail_bn", "multijail_sw", "multijail_jv", "multijail_it", "multijail_vi",
2727
"aya_redteaming_arabic", "aya_redteaming_english", "aya_redteaming_filipino", "aya_redteaming_french", "aya_redteaming_hindi", "aya_redteaming_russian", "aya_redteaming_serbian", "aya_redteaming_spanish",
28+
"harmbench",
2829
"toxigen",
2930
"bbq"
3031
],
@@ -38,11 +39,11 @@
3839
"mbpp"
3940
],
4041
"reasoning_test": [
41-
"arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh", "arc",
4242
"gpqa_main_cot_zeroshot",
4343
"gpqa_diamond_cot_zeroshot",
4444
"gpqa_extended_cot_zeroshot",
45-
"gsm8k_platinum_cot_zeroshot"
45+
"gsm8k_platinum_cot_zeroshot",
46+
"longbench_hotpotqa"
4647
]
4748
},
4849
"other": [

src/evals/tasks.py

Lines changed: 106 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,50 +20,123 @@ class Dimension(enum.StrEnum):
2020
general_abilities = enum.auto()
2121
factual_agnostic = enum.auto()
2222
factual_regional = enum.auto()
23-
safety = enum.auto()
24-
reasoning = enum.auto()
25-
math = enum.auto()
26-
coding = enum.auto()
27-
if_and_chat = enum.auto()
28-
bias = enum.auto()
23+
reasoning_multihop = enum.auto()
24+
reasoning_planning = enum.auto()
25+
reasoning_math = enum.auto()
26+
reasoning_code = enum.auto()
27+
reasoning_if_and_chat = enum.auto()
28+
reasoning_causal = enum.auto()
29+
redteaming_biases_and_hate_speech = enum.auto()
30+
redteaming_alignment = enum.auto()
31+
redteaming_robustness = enum.auto()
2932

3033
@classmethod
def get(cls, name: str) -> Dimension:
    """Infer the evaluation dimension of a task from its name.

    Matching is by prefix: ``name`` belongs to a dimension when it starts
    with one of that dimension's benchmark prefixes, so language or
    variant suffixes (e.g. ``arc_de``, ``gsm8k_platinum_cot_zeroshot``)
    resolve to the same dimension as the base benchmark.

    Dimensions are tried in the order they are listed below and the first
    match wins, so the ordering is part of the behaviour.

    Args:
        name: Task name to classify.

    Returns:
        The member of this enum whose prefix group matches ``name``.

    Raises:
        ValueError: If ``name`` starts with none of the known prefixes.
    """
    # Keys are member names of this enum; values are the task-name
    # prefixes that map to that member.
    # NOTE: keep a comma after every entry. Adjacent string literals are
    # silently concatenated by Python, which turns two prefixes into one
    # prefix that matches nothing.
    prefixes_by_dimension: dict[str, tuple[str, ...]] = {
        "general_abilities": (
            "piqa",
            "winogrande",
            "xwinograd",
            "xnli",
            "copa",
            "xcopa",
            "hellaswag",
            "m_hellaswag",
            "arc",
            "m_arc",
        ),
        "factual_agnostic": (
            "mmlu",
            "global_mmlu",
            "truthfulqa",
        ),
        "factual_regional": (
            "include",
            "switzerland_qa",
            "cultural_bench",
            "blend",
            "calmqa",
        ),
        "reasoning_multihop": (
            "agieval",
            "drop",
            "mlogiqa",
            "bbh",
            # "arc" / "m_arc" are never reached here: general_abilities is
            # checked first and already claims both prefixes.
            "arc",
            "m_arc",
            "hotpotqa",
            # The task config registers HotpotQA as "longbench_hotpotqa",
            # which the bare "hotpotqa" prefix cannot match.
            "longbench_hotpotqa",
        ),
        "reasoning_planning": (
            "acpbench",
        ),
        "reasoning_math": (
            "hendrycks_math",
            "gsm8k",
            "aime",
            "mathqa",
            "mgsm",
            "polymath",
            "gpqa",
        ),
        "reasoning_code": (
            "humaneval",
            "mbpp",
            "bigcodebench",
        ),
        "reasoning_if_and_chat": (
            "ifeval",
            "multi_ifeval",
        ),
        "reasoning_causal": (
            "crab",
        ),
        "redteaming_biases_and_hate_speech": (
            "bbq",
            "toxigen",
            "realtoxicityprompts",
            "polyglotoxicityprompts",
            "aya_redteaming",
        ),
        "redteaming_alignment": (
            "reward_bench",
            "m_arena_hard",
        ),
        "redteaming_robustness": (
            "harmbench",
            "multijail",
        ),
    }

    for dimension, prefixes in prefixes_by_dimension.items():
        # str.startswith accepts a tuple and is true if any entry matches.
        if name.startswith(prefixes):
            return cls[dimension]
    raise ValueError(f"Could not infer dimension for task {name}")
61134

62135

63136
class TaskKind(enum.StrEnum):
64137
pretrain = enum.auto()
65-
knowledge_dev = enum.auto()
66-
knowledge_test = enum.auto()
138+
factual_dev = enum.auto()
139+
factual_test = enum.auto()
67140
sft_dev = enum.auto()
68141
alignment_dev = enum.auto()
69142
reasoning_dev = enum.auto()

0 commit comments

Comments
 (0)