Skip to content

Commit f9ed148

Browse files
committed
gemini/claude eval results + visualization notebook
1 parent 3e4a53d commit f9ed148

7 files changed

+1994
-321
lines changed

evaluation/eval_results/simple_qa_250_claude-3.5-sonnet-agent_results.jsonl

Lines changed: 158 additions & 158 deletions
Large diffs are not rendered by default.
Lines changed: 87 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,95 @@
1-
{"question_id": "simple_qa_0", "grade": "B"}
1+
{"question_id": "simple_qa_3", "grade": "A"}
22
{"question_id": "simple_qa_2", "grade": "C"}
3+
{"question_id": "simple_qa_0", "grade": "B"}
34
{"question_id": "simple_qa_1", "grade": "B"}
4-
{"question_id": "simple_qa_3", "grade": "A"}
5-
{"question_id": "simple_qa_6", "grade": "B"}
65
{"question_id": "simple_qa_4", "grade": "A"}
7-
{"question_id": "simple_qa_5", "grade": "C"}
86
{"question_id": "simple_qa_7", "grade": "A"}
7+
{"question_id": "simple_qa_5", "grade": "C"}
8+
{"question_id": "simple_qa_6", "grade": "B"}
99
{"question_id": "simple_qa_8", "grade": "A"}
10-
{"question_id": "simple_qa_9", "grade": "C"}
11-
{"question_id": "simple_qa_11", "grade": "B"}
1210
{"question_id": "simple_qa_10", "grade": "B"}
13-
{"question_id": "simple_qa_12", "grade": "B"}
11+
{"question_id": "simple_qa_11", "grade": "B"}
12+
{"question_id": "simple_qa_9", "grade": "C"}
1413
{"question_id": "simple_qa_13", "grade": "B"}
1514
{"question_id": "simple_qa_14", "grade": "C"}
1615
{"question_id": "simple_qa_15", "grade": "C"}
1716
{"question_id": "simple_qa_16", "grade": "B"}
17+
{"question_id": "simple_qa_12", "grade": "B"}
1818
{"question_id": "simple_qa_17", "grade": "A"}
1919
{"question_id": "simple_qa_18", "grade": "B"}
2020
{"question_id": "simple_qa_19", "grade": "C"}
21-
{"question_id": "simple_qa_21", "grade": "B"}
22-
{"question_id": "simple_qa_23", "grade": "B"}
21+
{"question_id": "simple_qa_20", "grade": "C"}
2322
{"question_id": "simple_qa_22", "grade": "A"}
2423
{"question_id": "simple_qa_24", "grade": "C"}
24+
{"question_id": "simple_qa_23", "grade": "B"}
2525
{"question_id": "simple_qa_25", "grade": "C"}
26-
{"question_id": "simple_qa_26", "grade": "B"}
26+
{"question_id": "simple_qa_21", "grade": "B"}
2727
{"question_id": "simple_qa_27", "grade": "B"}
28-
{"question_id": "simple_qa_29", "grade": "C"}
2928
{"question_id": "simple_qa_28", "grade": "A"}
30-
{"question_id": "simple_qa_20", "grade": "C"}
31-
{"question_id": "simple_qa_31", "grade": "C"}
29+
{"question_id": "simple_qa_29", "grade": "C"}
3230
{"question_id": "simple_qa_30", "grade": "A"}
31+
{"question_id": "simple_qa_31", "grade": "C"}
3332
{"question_id": "simple_qa_32", "grade": "A"}
3433
{"question_id": "simple_qa_33", "grade": "B"}
3534
{"question_id": "simple_qa_34", "grade": "C"}
3635
{"question_id": "simple_qa_35", "grade": "B"}
37-
{"question_id": "simple_qa_37", "grade": "B"}
3836
{"question_id": "simple_qa_36", "grade": "A"}
37+
{"question_id": "simple_qa_37", "grade": "B"}
3938
{"question_id": "simple_qa_38", "grade": "B"}
4039
{"question_id": "simple_qa_39", "grade": "B"}
4140
{"question_id": "simple_qa_40", "grade": "C"}
42-
{"question_id": "simple_qa_41", "grade": "C"}
41+
{"question_id": "simple_qa_26", "grade": "B"}
4342
{"question_id": "simple_qa_42", "grade": "C"}
44-
{"question_id": "simple_qa_43", "grade": "A"}
43+
{"question_id": "simple_qa_41", "grade": "C"}
4544
{"question_id": "simple_qa_44", "grade": "B"}
45+
{"question_id": "simple_qa_43", "grade": "A"}
4646
{"question_id": "simple_qa_45", "grade": "C"}
4747
{"question_id": "simple_qa_46", "grade": "B"}
48-
{"question_id": "simple_qa_47", "grade": "B"}
4948
{"question_id": "simple_qa_49", "grade": "C"}
5049
{"question_id": "simple_qa_48", "grade": "C"}
50+
{"question_id": "simple_qa_47", "grade": "B"}
5151
{"question_id": "simple_qa_50", "grade": "C"}
52+
{"question_id": "simple_qa_53", "grade": "C"}
5253
{"question_id": "simple_qa_51", "grade": "B"}
5354
{"question_id": "simple_qa_52", "grade": "A"}
54-
{"question_id": "simple_qa_53", "grade": "C"}
5555
{"question_id": "simple_qa_54", "grade": "A"}
5656
{"question_id": "simple_qa_55", "grade": "B"}
57-
{"question_id": "simple_qa_56", "grade": "A"}
5857
{"question_id": "simple_qa_57", "grade": "C"}
58+
{"question_id": "simple_qa_56", "grade": "A"}
5959
{"question_id": "simple_qa_59", "grade": "C"}
6060
{"question_id": "simple_qa_58", "grade": "B"}
6161
{"question_id": "simple_qa_61", "grade": "C"}
6262
{"question_id": "simple_qa_60", "grade": "A"}
6363
{"question_id": "simple_qa_62", "grade": "A"}
64+
{"question_id": "simple_qa_64", "grade": "A"}
6465
{"question_id": "simple_qa_63", "grade": "B"}
6566
{"question_id": "simple_qa_65", "grade": "A"}
66-
{"question_id": "simple_qa_64", "grade": "A"}
6767
{"question_id": "simple_qa_66", "grade": "A"}
68-
{"question_id": "simple_qa_67", "grade": "C"}
69-
{"question_id": "simple_qa_68", "grade": "C"}
7068
{"question_id": "simple_qa_69", "grade": "A"}
71-
{"question_id": "simple_qa_70", "grade": "C"}
72-
{"question_id": "simple_qa_71", "grade": "C"}
69+
{"question_id": "simple_qa_68", "grade": "C"}
70+
{"question_id": "simple_qa_67", "grade": "C"}
7371
{"question_id": "simple_qa_72", "grade": "A"}
72+
{"question_id": "simple_qa_71", "grade": "C"}
7473
{"question_id": "simple_qa_73", "grade": "A"}
75-
{"question_id": "simple_qa_74", "grade": "B"}
74+
{"question_id": "simple_qa_70", "grade": "C"}
7675
{"question_id": "simple_qa_75", "grade": "B"}
76+
{"question_id": "simple_qa_74", "grade": "B"}
7777
{"question_id": "simple_qa_77", "grade": "C"}
78+
{"question_id": "simple_qa_76", "grade": "B"}
7879
{"question_id": "simple_qa_78", "grade": "C"}
7980
{"question_id": "simple_qa_79", "grade": "A"}
80-
{"question_id": "simple_qa_76", "grade": "B"}
81-
{"question_id": "simple_qa_81", "grade": "C"}
8281
{"question_id": "simple_qa_80", "grade": "A"}
82+
{"question_id": "simple_qa_81", "grade": "C"}
8383
{"question_id": "simple_qa_83", "grade": "B"}
8484
{"question_id": "simple_qa_82", "grade": "B"}
85-
{"question_id": "simple_qa_84", "grade": "C"}
8685
{"question_id": "simple_qa_85", "grade": "A"}
87-
{"question_id": "simple_qa_86", "grade": "A"}
86+
{"question_id": "simple_qa_84", "grade": "C"}
8887
{"question_id": "simple_qa_87", "grade": "A"}
89-
{"question_id": "simple_qa_88", "grade": "C"}
88+
{"question_id": "simple_qa_86", "grade": "A"}
9089
{"question_id": "simple_qa_89", "grade": "C"}
91-
{"question_id": "simple_qa_90", "grade": "C"}
90+
{"question_id": "simple_qa_88", "grade": "C"}
9291
{"question_id": "simple_qa_91", "grade": "A"}
92+
{"question_id": "simple_qa_90", "grade": "C"}
9393
{"question_id": "simple_qa_92", "grade": "A"}
9494
{"question_id": "simple_qa_93", "grade": "B"}
9595
{"question_id": "simple_qa_94", "grade": "A"}
@@ -99,152 +99,152 @@
9999
{"question_id": "simple_qa_98", "grade": "C"}
100100
{"question_id": "simple_qa_100", "grade": "C"}
101101
{"question_id": "simple_qa_99", "grade": "A"}
102-
{"question_id": "simple_qa_101", "grade": "A"}
103102
{"question_id": "simple_qa_102", "grade": "B"}
104103
{"question_id": "simple_qa_104", "grade": "B"}
104+
{"question_id": "simple_qa_103", "grade": "B"}
105105
{"question_id": "simple_qa_105", "grade": "B"}
106-
{"question_id": "simple_qa_106", "grade": "C"}
107-
{"question_id": "simple_qa_107", "grade": "B"}
108106
{"question_id": "simple_qa_108", "grade": "B"}
107+
{"question_id": "simple_qa_107", "grade": "B"}
108+
{"question_id": "simple_qa_106", "grade": "C"}
109+
{"question_id": "simple_qa_110", "grade": "B"}
110+
{"question_id": "simple_qa_101", "grade": "A"}
109111
{"question_id": "simple_qa_109", "grade": "C"}
110112
{"question_id": "simple_qa_111", "grade": "C"}
111-
{"question_id": "simple_qa_110", "grade": "B"}
112-
{"question_id": "simple_qa_112", "grade": "C"}
113113
{"question_id": "simple_qa_113", "grade": "C"}
114+
{"question_id": "simple_qa_112", "grade": "C"}
114115
{"question_id": "simple_qa_114", "grade": "B"}
115-
{"question_id": "simple_qa_115", "grade": "C"}
116116
{"question_id": "simple_qa_116", "grade": "B"}
117+
{"question_id": "simple_qa_115", "grade": "C"}
117118
{"question_id": "simple_qa_117", "grade": "A"}
118119
{"question_id": "simple_qa_118", "grade": "C"}
119120
{"question_id": "simple_qa_119", "grade": "A"}
120121
{"question_id": "simple_qa_120", "grade": "B"}
121-
{"question_id": "simple_qa_121", "grade": "B"}
122122
{"question_id": "simple_qa_122", "grade": "C"}
123123
{"question_id": "simple_qa_123", "grade": "A"}
124+
{"question_id": "simple_qa_121", "grade": "B"}
124125
{"question_id": "simple_qa_124", "grade": "C"}
125126
{"question_id": "simple_qa_125", "grade": "A"}
126127
{"question_id": "simple_qa_126", "grade": "A"}
127-
{"question_id": "simple_qa_127", "grade": "B"}
128-
{"question_id": "simple_qa_103", "grade": "B"}
129128
{"question_id": "simple_qa_128", "grade": "C"}
130-
{"question_id": "simple_qa_131", "grade": "C"}
131-
{"question_id": "simple_qa_129", "grade": "B"}
129+
{"question_id": "simple_qa_127", "grade": "B"}
132130
{"question_id": "simple_qa_130", "grade": "A"}
131+
{"question_id": "simple_qa_129", "grade": "B"}
133132
{"question_id": "simple_qa_132", "grade": "B"}
133+
{"question_id": "simple_qa_131", "grade": "C"}
134134
{"question_id": "simple_qa_133", "grade": "A"}
135135
{"question_id": "simple_qa_135", "grade": "A"}
136136
{"question_id": "simple_qa_134", "grade": "A"}
137-
{"question_id": "simple_qa_137", "grade": "C"}
138137
{"question_id": "simple_qa_136", "grade": "B"}
138+
{"question_id": "simple_qa_137", "grade": "C"}
139139
{"question_id": "simple_qa_138", "grade": "B"}
140140
{"question_id": "simple_qa_139", "grade": "B"}
141141
{"question_id": "simple_qa_140", "grade": "C"}
142-
{"question_id": "simple_qa_141", "grade": "C"}
142+
{"question_id": "simple_qa_141", "grade": "B"}
143143
{"question_id": "simple_qa_142", "grade": "C"}
144-
{"question_id": "simple_qa_144", "grade": "A"}
145144
{"question_id": "simple_qa_143", "grade": "C"}
146145
{"question_id": "simple_qa_145", "grade": "B"}
146+
{"question_id": "simple_qa_144", "grade": "A"}
147147
{"question_id": "simple_qa_146", "grade": "C"}
148-
{"question_id": "simple_qa_147", "grade": "A"}
149148
{"question_id": "simple_qa_149", "grade": "C"}
150149
{"question_id": "simple_qa_148", "grade": "C"}
150+
{"question_id": "simple_qa_147", "grade": "A"}
151+
{"question_id": "simple_qa_152", "grade": "A"}
151152
{"question_id": "simple_qa_150", "grade": "B"}
152-
{"question_id": "simple_qa_154", "grade": "C"}
153153
{"question_id": "simple_qa_151", "grade": "B"}
154-
{"question_id": "simple_qa_152", "grade": "A"}
155154
{"question_id": "simple_qa_153", "grade": "C"}
156-
{"question_id": "simple_qa_157", "grade": "C"}
155+
{"question_id": "simple_qa_154", "grade": "C"}
157156
{"question_id": "simple_qa_156", "grade": "A"}
158157
{"question_id": "simple_qa_155", "grade": "C"}
158+
{"question_id": "simple_qa_157", "grade": "C"}
159159
{"question_id": "simple_qa_158", "grade": "C"}
160-
{"question_id": "simple_qa_160", "grade": "B"}
161160
{"question_id": "simple_qa_159", "grade": "B"}
161+
{"question_id": "simple_qa_160", "grade": "B"}
162162
{"question_id": "simple_qa_162", "grade": "A"}
163-
{"question_id": "simple_qa_161", "grade": "C"}
164-
{"question_id": "simple_qa_164", "grade": "B"}
165163
{"question_id": "simple_qa_163", "grade": "B"}
164+
{"question_id": "simple_qa_161", "grade": "C"}
166165
{"question_id": "simple_qa_165", "grade": "A"}
166+
{"question_id": "simple_qa_164", "grade": "B"}
167+
{"question_id": "simple_qa_167", "grade": "C"}
167168
{"question_id": "simple_qa_166", "grade": "A"}
168-
{"question_id": "simple_qa_169", "grade": "C"}
169169
{"question_id": "simple_qa_168", "grade": "B"}
170-
{"question_id": "simple_qa_170", "grade": "C"}
171-
{"question_id": "simple_qa_167", "grade": "C"}
170+
{"question_id": "simple_qa_169", "grade": "C"}
172171
{"question_id": "simple_qa_171", "grade": "A"}
172+
{"question_id": "simple_qa_170", "grade": "C"}
173173
{"question_id": "simple_qa_172", "grade": "B"}
174-
{"question_id": "simple_qa_174", "grade": "C"}
174+
{"question_id": "simple_qa_173", "grade": "C"}
175175
{"question_id": "simple_qa_175", "grade": "A"}
176-
{"question_id": "simple_qa_177", "grade": "C"}
176+
{"question_id": "simple_qa_174", "grade": "C"}
177177
{"question_id": "simple_qa_176", "grade": "B"}
178+
{"question_id": "simple_qa_177", "grade": "C"}
178179
{"question_id": "simple_qa_178", "grade": "A"}
179-
{"question_id": "simple_qa_180", "grade": "C"}
180180
{"question_id": "simple_qa_179", "grade": "C"}
181+
{"question_id": "simple_qa_180", "grade": "C"}
181182
{"question_id": "simple_qa_182", "grade": "B"}
182183
{"question_id": "simple_qa_183", "grade": "B"}
183-
{"question_id": "simple_qa_184", "grade": "A"}
184184
{"question_id": "simple_qa_185", "grade": "B"}
185185
{"question_id": "simple_qa_181", "grade": "A"}
186186
{"question_id": "simple_qa_186", "grade": "B"}
187+
{"question_id": "simple_qa_189", "grade": "B"}
187188
{"question_id": "simple_qa_187", "grade": "C"}
188189
{"question_id": "simple_qa_188", "grade": "A"}
189-
{"question_id": "simple_qa_189", "grade": "B"}
190-
{"question_id": "simple_qa_191", "grade": "B"}
191190
{"question_id": "simple_qa_190", "grade": "C"}
192-
{"question_id": "simple_qa_173", "grade": "C"}
193-
{"question_id": "simple_qa_195", "grade": "C"}
191+
{"question_id": "simple_qa_192", "grade": "A"}
192+
{"question_id": "simple_qa_191", "grade": "B"}
194193
{"question_id": "simple_qa_193", "grade": "C"}
195-
{"question_id": "simple_qa_197", "grade": "C"}
194+
{"question_id": "simple_qa_195", "grade": "C"}
195+
{"question_id": "simple_qa_194", "grade": "C"}
196+
{"question_id": "simple_qa_196", "grade": "C"}
197+
{"question_id": "simple_qa_184", "grade": "A"}
196198
{"question_id": "simple_qa_198", "grade": "B"}
199+
{"question_id": "simple_qa_197", "grade": "C"}
197200
{"question_id": "simple_qa_199", "grade": "A"}
198-
{"question_id": "simple_qa_200", "grade": "B"}
199-
{"question_id": "simple_qa_201", "grade": "A"}
200-
{"question_id": "simple_qa_192", "grade": "A"}
201-
{"question_id": "simple_qa_196", "grade": "C"}
202201
{"question_id": "simple_qa_202", "grade": "A"}
203-
{"question_id": "simple_qa_203", "grade": "C"}
202+
{"question_id": "simple_qa_201", "grade": "A"}
203+
{"question_id": "simple_qa_200", "grade": "B"}
204204
{"question_id": "simple_qa_204", "grade": "A"}
205-
{"question_id": "simple_qa_207", "grade": "A"}
205+
{"question_id": "simple_qa_203", "grade": "C"}
206206
{"question_id": "simple_qa_205", "grade": "A"}
207+
{"question_id": "simple_qa_207", "grade": "A"}
207208
{"question_id": "simple_qa_206", "grade": "A"}
208209
{"question_id": "simple_qa_208", "grade": "C"}
209210
{"question_id": "simple_qa_209", "grade": "A"}
210-
{"question_id": "simple_qa_210", "grade": "B"}
211-
{"question_id": "simple_qa_212", "grade": "A"}
212211
{"question_id": "simple_qa_211", "grade": "C"}
213-
{"question_id": "simple_qa_213", "grade": "A"}
212+
{"question_id": "simple_qa_210", "grade": "B"}
214213
{"question_id": "simple_qa_214", "grade": "C"}
214+
{"question_id": "simple_qa_213", "grade": "A"}
215+
{"question_id": "simple_qa_212", "grade": "A"}
215216
{"question_id": "simple_qa_215", "grade": "B"}
216-
{"question_id": "simple_qa_216", "grade": "B"}
217-
{"question_id": "simple_qa_194", "grade": "C"}
218217
{"question_id": "simple_qa_217", "grade": "A"}
218+
{"question_id": "simple_qa_216", "grade": "B"}
219219
{"question_id": "simple_qa_218", "grade": "C"}
220220
{"question_id": "simple_qa_219", "grade": "C"}
221221
{"question_id": "simple_qa_220", "grade": "C"}
222-
{"question_id": "simple_qa_221", "grade": "C"}
223222
{"question_id": "simple_qa_222", "grade": "B"}
224-
{"question_id": "simple_qa_224", "grade": "A"}
225-
{"question_id": "simple_qa_223", "grade": "B"}
223+
{"question_id": "simple_qa_221", "grade": "C"}
226224
{"question_id": "simple_qa_225", "grade": "A"}
227225
{"question_id": "simple_qa_226", "grade": "C"}
228-
{"question_id": "simple_qa_227", "grade": "C"}
226+
{"question_id": "simple_qa_224", "grade": "A"}
229227
{"question_id": "simple_qa_228", "grade": "B"}
228+
{"question_id": "simple_qa_223", "grade": "B"}
229+
{"question_id": "simple_qa_227", "grade": "C"}
230+
{"question_id": "simple_qa_229", "grade": "B"}
230231
{"question_id": "simple_qa_230", "grade": "C"}
231-
{"question_id": "simple_qa_231", "grade": "C"}
232-
{"question_id": "simple_qa_232", "grade": "A"}
233232
{"question_id": "simple_qa_233", "grade": "B"}
233+
{"question_id": "simple_qa_231", "grade": "C"}
234234
{"question_id": "simple_qa_234", "grade": "A"}
235+
{"question_id": "simple_qa_232", "grade": "A"}
235236
{"question_id": "simple_qa_235", "grade": "C"}
236-
{"question_id": "simple_qa_236", "grade": "B"}
237-
{"question_id": "simple_qa_229", "grade": "B"}
238237
{"question_id": "simple_qa_237", "grade": "C"}
238+
{"question_id": "simple_qa_236", "grade": "B"}
239239
{"question_id": "simple_qa_238", "grade": "B"}
240-
{"question_id": "simple_qa_241", "grade": "A"}
241240
{"question_id": "simple_qa_239", "grade": "B"}
242-
{"question_id": "simple_qa_240", "grade": "B"}
243241
{"question_id": "simple_qa_242", "grade": "C"}
242+
{"question_id": "simple_qa_240", "grade": "B"}
243+
{"question_id": "simple_qa_241", "grade": "A"}
244+
{"question_id": "simple_qa_244", "grade": "A"}
244245
{"question_id": "simple_qa_245", "grade": "A"}
245-
{"question_id": "simple_qa_246", "grade": "B"}
246246
{"question_id": "simple_qa_243", "grade": "B"}
247-
{"question_id": "simple_qa_244", "grade": "A"}
247+
{"question_id": "simple_qa_246", "grade": "B"}
248248
{"question_id": "simple_qa_247", "grade": "B"}
249249
{"question_id": "simple_qa_248", "grade": "A"}
250250
{"question_id": "simple_qa_249", "grade": "C"}

0 commit comments

Comments
 (0)