evaluation_llm360_amber_heavy.json
{
  "name": "eval/heavy",
  "uuid": "c74f73f2-383e-4ccb-a01f-ed1fac73caeb",
  "model": "LLM360/Amber",
  "creation_date": "2024_06_03-03_11_48",
  "eval_metrics": {
    "icl": {
      "mmlu_zeroshot": 0.2707244845335944,
      "hellaswag_zeroshot": 0.7125074863433838,
      "jeopardy": 0.42513042986392974,
      "bigbench_qa_wikidata": 0.6998178958892822,
      "arc_easy": 0.7032828330993652,
      "arc_challenge": 0.39761093258857727,
      "mmlu_fewshot": 0.2790033432998155,
      "bigbench_misconceptions": 0.4840182662010193,
      "copa": 0.7400000095367432,
      "siqa": 0.5424769520759583,
      "commonsense_qa": 0.345618337392807,
      "piqa": 0.7921653985977173,
      "openbook_qa": 0.3840000033378601,
      "bigbench_novel_concepts": 0.5625,
      "bigbench_strange_stories": 0.5344827771186829,
      "bigbench_strategy_qa": 0.5495849847793579,
      "lambada_openai": 0.6819328665733337,
      "hellaswag": 0.7261501550674438,
      "winograd": 0.8241758346557617,
      "winogrande": 0.6456195712089539,
      "bigbench_conlang_translation": 0.05487804859876633,
      "bigbench_language_identification": 0.25380000472068787,
      "bigbench_conceptual_combinations": 0.34951457381248474,
      "bigbench_elementary_math_qa": 0.263600617647171,
      "bigbench_dyck_languages": 0.2370000034570694,
      "agi_eval_lsat_ar": 0.27826085686683655,
      "bigbench_cs_algorithms": 0.49318182468414307,
      "bigbench_logical_deduction": 0.24400000274181366,
      "bigbench_operators": 0.3571428656578064,
      "bigbench_repeat_copy_logic": 0.09375,
      "simple_arithmetic_nospaces": 0.1379999965429306,
      "simple_arithmetic_withspaces": 0.15000000596046448,
      "math_qa": 0.2567884624004364,
      "logi_qa": 0.25652840733528137,
      "pubmed_qa_labeled": 0.6150000095367432,
      "squad": 0.5237464308738708,
      "agi_eval_lsat_rc": 0.25,
      "agi_eval_lsat_lr": 0.2607843279838562,
      "coqa": 0.3974696099758148,
      "bigbench_understanding_fables": 0.22751322388648987,
      "boolq": 0.6792048811912537,
      "agi_eval_sat_en": 0.25242719054222107,
      "winogender_mc_female": 0.46666666865348816,
      "winogender_mc_male": 0.550000011920929,
      "enterprise_pii_classification": 0.5431516766548157,
      "bbq": 0.4496057792143388,
      "gpqa_main": 0.2276785671710968,
      "gpqa_diamond": 0.24747474491596222,
      "gsm8k_cot": 0.042456407099962234,
      "agi_eval_sat_math_cot": 0.027272727340459824,
      "aqua_cot": 0.020408162847161293,
      "svamp_cot": 0.15666666626930237,
      "triviaqa_sm_sub": 0.4203333258628845
    }
  },
  "aggregated_task_categories_centered": {
    "commonsense reasoning": 0.29504241647970475,
    "language understanding": 0.40496615777294204,
    "reading comprehension": 0.233515820400626,
    "safety": 0.004712068221785798,
    "symbolic problem solving": 0.13338245652684064,
    "world knowledge": 0.23476847248293503
  },
  "aggregated_centered_results": 0.22331066043104308,
  "aggregated_results": 0.3978322385760401,
  "rw_small": 0.6463709523280462,
  "rw_small_centered": 0.39081830274291907,
  "95%_CI_above": 0.5165590592212491,
  "95%_CI_above_centered": 0.35168530357243155,
  "99%_CI_above": 0.5346052136110223,
  "99%_CI_above_centered": 0.4035413091370024,
  "low_variance_datasets": 0.5177985559810292,
  "low_variance_datasets_centered": 0.3981474846494972,
  "_filename": "exp_data/evals/evaluation_llm360_amber_heavy.json",
  "missing tasks": "[]",
  "Core": 0.3981474846494972,
  "Extended": 0.22331066043104308
}
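
For context, this file follows the DCLM evaluation-output layout: per-task accuracies under eval_metrics.icl, centered per-category aggregates, and headline Core/Extended scores at the top level. Below is a minimal sketch of how such a file might be read; the load_eval_summary helper is an illustrative assumption, not part of the DCLM tooling, and the path is taken from the file's own "_filename" field.

import json

def load_eval_summary(path):
    # Parse the evaluation JSON and pull out the headline numbers.
    # Key names ("model", "Core", "Extended", "eval_metrics" -> "icl") match
    # this file; other eval dumps may differ, so .get() is used defensively.
    with open(path) as f:
        data = json.load(f)
    per_task = data.get("eval_metrics", {}).get("icl", {})
    return {
        "model": data.get("model"),
        "core": data.get("Core"),
        "extended": data.get("Extended"),
        "num_tasks": len(per_task),
    }

if __name__ == "__main__":
    # Path as recorded in this file's "_filename"; adjust to your checkout.
    summary = load_eval_summary("exp_data/evals/evaluation_llm360_amber_heavy.json")
    print(summary)
    # For this file: model 'LLM360/Amber', core ~0.398, extended ~0.223, 53 per-task scores.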