-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmetrics.json
More file actions
116 lines (116 loc) · 2.34 KB
/
metrics.json
File metadata and controls
116 lines (116 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
{
"main": {
"accuracy": 52.15,
"variance": 1.8228,
"n": 1369,
"calibration_error": 36.54
},
"alibaba/qwen3-max": {
"accuracy": 17.31,
"variance": 27.5233,
"n": 52,
"calibration_error": 54.45
},
"mistral/mistral-large": {
"accuracy": 5.88,
"variance": 8.1417,
"n": 68,
"calibration_error": 66.34
},
"zai/glm-4.6": {
"accuracy": 23.08,
"variance": 34.1375,
"n": 52,
"calibration_error": 49.63
},
"anthropic/claude-opus-4.5": {
"accuracy": 29.66,
"variance": 1.5628,
"n": 1335,
"calibration_error": 53.39
},
"anthropic/claude-sonnet-4.5": {
"accuracy": 18.11,
"variance": 1.1779,
"n": 1259,
"calibration_error": 60.44
},
"google/gemini-3-pro-preview": {
"accuracy": 44.74,
"variance": 1.8561,
"n": 1332,
"calibration_error": 61.93
},
"openai/gpt-5.1": {
"accuracy": 38.23,
"variance": 1.8348,
"n": 1287,
"calibration_error": 44.45
},
"xai/grok-4": {
"accuracy": 29.05,
"variance": 1.7878,
"n": 1153,
"calibration_error": 50.6
},
"mistral/magistral-medium": {
"accuracy": 13.51,
"variance": 31.5875,
"n": 37,
"calibration_error": 46.11
},
"alibaba/qwen3-next-80b-a3b-thinking": {
"accuracy": 17.5,
"variance": 18.0469,
"n": 80,
"calibration_error": 50.7
},
"minimax/minimax-m2": {
"accuracy": 16.67,
"variance": 231.4815,
"n": 6,
"calibration_error": null
},
"moonshotai/kimi-k2-thinking-turbo": {
"accuracy": 17.55,
"variance": 1.1652,
"n": 1242,
"calibration_error": 69.63
},
"openai/gpt-5-pro": {
"accuracy": 39.53,
"variance": 2.002,
"n": 1194,
"calibration_error": 48.37
},
"google/gemini-2.5-flash": {
"accuracy": 0.0,
"variance": 0.0,
"n": 2,
"calibration_error": null
},
"alibaba/qwen3-vl-thinking": {
"accuracy": 33.33,
"variance": 740.7407,
"n": 3,
"calibration_error": null
},
"google/gemini-2.5-pro": {
"accuracy": 16.51,
"variance": 1.0991,
"n": 1254,
"calibration_error": 76.77
},
"deepseek/deepseek-v3.2-thinking": {
"accuracy": 24.13,
"variance": 1.5606,
"n": 1173,
"calibration_error": 53.45
},
"deepseek/deepseek-v3.2-exp-thinking": {
"accuracy": 0.0,
"variance": 0.0,
"n": 1,
"calibration_error": null
}
}