|
1 | 1 | { |
2 | 2 | "infer": { |
3 | 3 | "pretrain": [ |
| 4 | + "arc", "arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh", |
| 5 | + "hellaswag", "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi", |
4 | 6 | "winogrande", "xcopa_et", "xcopa_ht", "xcopa_qu", "xcopa_id", "xcopa_it", "xcopa_sw", "xcopa_ta", "xcopa_th", "xcopa_tr", "xcopa_vi", "xcopa_zh", "xnli_ar", "xnli_bg", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_sw", "xnli_th", "xnli_tr", "xnli_ur", "xnli_vi", "xnli_zh", "xwinograd_en", "xwinograd_fr", "xwinograd_pt", "xwinograd_ru", "xwinograd_zh" |
5 | 7 | ], |
6 | | - "knowledge_dev": [ |
| 8 | + "factual_dev": [ |
7 | 9 | "mmlu", "global_mmlu_ar", "global_mmlu_bn", "global_mmlu_de", "global_mmlu_es", "global_mmlu_fr", "global_mmlu_hi", "global_mmlu_id", "global_mmlu_it", "global_mmlu_ja", "global_mmlu_ko", "global_mmlu_pt", "global_mmlu_sw", "global_mmlu_yo", "global_mmlu_zh", |
8 | 10 | "include_base_44_albanian", "include_base_44_arabic", "include_base_44_armenian", "include_base_44_azerbaijani", "include_base_44_basque", "include_base_44_belarusian", "include_base_44_bengali", "include_base_44_bulgarian", "include_base_44_chinese", "include_base_44_croatian", "include_base_44_finnish", "include_base_44_french", "include_base_44_georgian", "include_base_44_german", "include_base_44_hebrew", "include_base_44_hindi", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_kazakh", "include_base_44_korean", "include_base_44_lithuanian", "include_base_44_malay", "include_base_44_malayalam", "include_base_44_nepali", "include_base_44_persian", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_serbian", "include_base_44_spanish", "include_base_44_tagalog", "include_base_44_tamil", "include_base_44_telugu", "include_base_44_turkish", "include_base_44_ukrainian", "include_base_44_uzbek", "include_base_44_vietnamese", "mmlu", |
9 | | - "blend_algeria", "blend_assam", "blend_azerbaijan", "blend_china", "blend_ethiopia", "blend_greece", "blend_indonesia", "blend_iran", "blend_mexico", "blend_north_korea", "blend_northern_nigeria", "blend_south_korea", "blend_spain", "blend_uk", "blend_us", "blend_west_java", |
10 | | - "cultural_bench_easy_argentina", "cultural_bench_easy_australia", "cultural_bench_easy_bangladesh", "cultural_bench_easy_brazil", "cultural_bench_easy_canada", "cultural_bench_easy_chile", "cultural_bench_easy_china", "cultural_bench_easy_czech_republic", "cultural_bench_easy_egypt", "cultural_bench_easy_france", "cultural_bench_easy_germany", "cultural_bench_easy_hong_kong", "cultural_bench_easy_india", "cultural_bench_easy_indonesia", "cultural_bench_easy_iran", "cultural_bench_easy_israel", "cultural_bench_easy_italy", "cultural_bench_easy_japan", "cultural_bench_easy_lebanon", "cultural_bench_easy_malaysia", "cultural_bench_easy_mexico", "cultural_bench_easy_morocco", "cultural_bench_easy_nepal", "cultural_bench_easy_netherlands", "cultural_bench_easy_new_zealand", "cultural_bench_easy_nigeria", "cultural_bench_easy_pakistan", "cultural_bench_easy_peru", "cultural_bench_easy_philippines", "cultural_bench_easy_poland", "cultural_bench_easy_romania", "cultural_bench_easy_russia", "cultural_bench_easy_saudi_arabia", "cultural_bench_easy_singapore", "cultural_bench_easy_south_africa", "cultural_bench_easy_south_korea", "cultural_bench_easy_spain", "cultural_bench_easy_taiwan", "cultural_bench_easy_thailand", "cultural_bench_easy_turkey", "cultural_bench_easy_ukraine", "cultural_bench_easy_united_kingdom", "cultural_bench_easy_united_states", "cultural_bench_easy_vietnam", "cultural_bench_easy_zimbabwe", |
11 | | - "cultural_bench_hard_argentina", "cultural_bench_hard_australia", "cultural_bench_hard_bangladesh", "cultural_bench_hard_brazil", "cultural_bench_hard_canada", "cultural_bench_hard_chile", "cultural_bench_hard_china", "cultural_bench_hard_czech_republic", "cultural_bench_hard_egypt", "cultural_bench_hard_france", "cultural_bench_hard_germany", "cultural_bench_hard_hong_kong", "cultural_bench_hard_india", "cultural_bench_hard_indonesia", "cultural_bench_hard_iran", "cultural_bench_hard_israel", "cultural_bench_hard_italy", "cultural_bench_hard_japan", "cultural_bench_hard_lebanon", "cultural_bench_hard_malaysia", "cultural_bench_hard_mexico", "cultural_bench_hard_morocco", "cultural_bench_hard_nepal", "cultural_bench_hard_netherlands", "cultural_bench_hard_new_zealand", "cultural_bench_hard_nigeria", "cultural_bench_hard_pakistan", "cultural_bench_hard_peru", "cultural_bench_hard_philippines", "cultural_bench_hard_poland", "cultural_bench_hard_romania", "cultural_bench_hard_russia", "cultural_bench_hard_saudi_arabia", "cultural_bench_hard_singapore", "cultural_bench_hard_south_africa", "cultural_bench_hard_south_korea", "cultural_bench_hard_spain", "cultural_bench_hard_taiwan", "cultural_bench_hard_thailand", "cultural_bench_hard_turkey", "cultural_bench_hard_ukraine", "cultural_bench_hard_united_kingdom", "cultural_bench_hard_united_states", "cultural_bench_hard_vietnam", "cultural_bench_hard_zimbabwe", |
| 11 | + "blend", |
| 12 | + "cultural_bench", |
12 | 13 | "truthfulqa", |
13 | 14 | "truthfulqa_ar", "truthfulqa_bn", "truthfulqa_ca", "truthfulqa_da", "truthfulqa_de", "truthfulqa_es", "truthfulqa_eu", "truthfulqa_fr", "truthfulqa_gu", "truthfulqa_hi", "truthfulqa_hr", "truthfulqa_hu", "truthfulqa_hy", "truthfulqa_id", "truthfulqa_it", "truthfulqa_kn", "truthfulqa_ml", "truthfulqa_mr", "truthfulqa_ne", "truthfulqa_nl", "truthfulqa_pt", "truthfulqa_ro", "truthfulqa_ru", "truthfulqa_sk", "truthfulqa_sr", "truthfulqa_sv", "truthfulqa_ta", "truthfulqa_te", "truthfulqa_uk", "truthfulqa_vi", "truthfulqa_zh" |
14 | 15 | ], |
15 | | - "knowledge_test": [ |
| 16 | + "factual_test": [ |
16 | 17 | "mmlu_pro", "agieval", |
17 | 18 | "mmlu_prox_ar", "mmlu_prox_bn", "mmlu_prox_de", "mmlu_prox_en", "mmlu_prox_es", "mmlu_prox_fr", "mmlu_prox_hi", "mmlu_prox_ja", "mmlu_prox_ko", "mmlu_prox_pt", "mmlu_prox_sw", "mmlu_prox_th", "mmlu_prox_zh" |
18 | 19 | ], |
19 | 20 | "sft_dev": [ |
20 | | - "ifeval", |
21 | | - "hellaswag", "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi" |
| 21 | + "ifeval" |
22 | 22 | ], |
23 | 23 | "alignment_dev": [ |
24 | 24 | "realtoxicityprompts", |
25 | 25 | "polyglotoxicityprompts_full_arabic", "polyglotoxicityprompts_full_czech", "polyglotoxicityprompts_full_german", "polyglotoxicityprompts_full_english", "polyglotoxicityprompts_full_spanish", "polyglotoxicityprompts_full_french", "polyglotoxicityprompts_full_hindi", "polyglotoxicityprompts_full_indonesian", "polyglotoxicityprompts_full_italian", "polyglotoxicityprompts_full_japanese", "polyglotoxicityprompts_full_korean", "polyglotoxicityprompts_full_dutch", "polyglotoxicityprompts_full_polish", "polyglotoxicityprompts_full_portuguese", "polyglotoxicityprompts_full_russian", "polyglotoxicityprompts_full_swedish", "polyglotoxicityprompts_full_chinese", |
26 | 26 | "multijail_en", "multijail_ar", "multijail_ko", "multijail_th", "multijail_bn", "multijail_sw", "multijail_jv", "multijail_it", "multijail_vi", |
27 | 27 | "aya_redteaming_arabic", "aya_redteaming_english", "aya_redteaming_filipino", "aya_redteaming_french", "aya_redteaming_hindi", "aya_redteaming_russian", "aya_redteaming_serbian", "aya_redteaming_spanish", |
| 28 | + "harmbench", |
28 | 29 | "toxigen", |
29 | 30 | "bbq" |
30 | 31 | ], |
|
38 | 39 | "mbpp" |
39 | 40 | ], |
40 | 41 | "reasoning_test": [ |
41 | | - "arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh", "arc", |
42 | 42 | "gpqa_main_cot_zeroshot", |
43 | 43 | "gpqa_diamond_cot_zeroshot", |
44 | 44 | "gpqa_extended_cot_zeroshot", |
45 | | - "gsm8k_platinum_cot_zeroshot" |
| 45 | + "gsm8k_platinum_cot_zeroshot", |
| 46 | + "longbench_hotpotqa" |
46 | 47 | ] |
47 | 48 | }, |
48 | 49 | "other": [ |
|
0 commit comments