Skip to content

Commit c907869

Browse files
committed
analysis notebooks
1 parent 068e94f commit c907869

33 files changed

+5273
-304
lines changed

analysis/commitment_metrics.ipynb

Lines changed: 993 additions & 0 deletions
Large diffs are not rendered by default.

analysis/foqa_viz.ipynb

Lines changed: 282 additions & 0 deletions
Large diffs are not rendered by default.

analysis/img/foqa_grouped.pdf

19.6 KB
Binary file not shown.

analysis/img/tp_grouped.pdf

15.9 KB
Binary file not shown.
Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
"metadata": {
1313
"collapsed": true,
1414
"ExecuteTime": {
15-
"end_time": "2024-12-02T14:41:25.972082Z",
16-
"start_time": "2024-12-02T14:41:25.749960Z"
15+
"end_time": "2025-12-03T18:34:12.116429Z",
16+
"start_time": "2025-12-03T18:34:11.848275Z"
1717
}
1818
},
1919
"source": [
@@ -38,8 +38,8 @@
3838
{
3939
"metadata": {
4040
"ExecuteTime": {
41-
"end_time": "2024-12-02T14:41:25.974662Z",
42-
"start_time": "2024-12-02T14:41:25.973144Z"
41+
"end_time": "2025-12-03T18:34:12.121834Z",
42+
"start_time": "2025-12-03T18:34:12.120497Z"
4343
}
4444
},
4545
"cell_type": "code",
@@ -56,8 +56,8 @@
5656
{
5757
"metadata": {
5858
"ExecuteTime": {
59-
"end_time": "2024-12-02T14:41:25.979243Z",
60-
"start_time": "2024-12-02T14:41:25.975169Z"
59+
"end_time": "2025-12-03T18:34:14.487901Z",
60+
"start_time": "2025-12-03T18:34:14.484170Z"
6161
}
6262
},
6363
"cell_type": "code",
@@ -126,8 +126,8 @@
126126
{
127127
"metadata": {
128128
"ExecuteTime": {
129-
"end_time": "2024-12-02T14:41:27.035496Z",
130-
"start_time": "2024-12-02T14:41:25.980586Z"
129+
"end_time": "2025-12-03T18:34:16.768570Z",
130+
"start_time": "2025-12-03T18:34:16.405943Z"
131131
}
132132
},
133133
"cell_type": "code",
@@ -166,27 +166,13 @@
166166
"Avg prompt tokens del: 28171.0 ($0.0140855)\n",
167167
"Avg output tokens del: 509.6818181818182 ($0.0007645227272727273)\n",
168168
"N: 308\n",
169-
"Total cost: $9.200377000000001\n",
170-
"========== /Users/andrew/Desktop/Code/_penn/redel-experiments/experiments/travelplanner/validation/small-leaf ==========\n",
171-
"Avg prompt tokens root: 7912.888888888889 ($0.039564444444444444)\n",
172-
"Avg output tokens root: 1018.7333333333333 ($0.015281)\n",
173-
"Avg prompt tokens del: 9192.044444444444 ($0.0045960222222222215)\n",
174-
"Avg output tokens del: 1121.0833333333333 ($0.001681625)\n",
175-
"N: 180\n",
176-
"Total cost: $11.0021565\n",
177-
"========== /Users/andrew/Desktop/Code/_penn/redel-experiments/experiments/webarena/test/small-leaf ==========\n",
178-
"Avg prompt tokens root: 55456.268 ($0.27728134)\n",
179-
"Avg output tokens root: 171.884 ($0.0025782599999999998)\n",
180-
"Avg prompt tokens del: 222370.916 ($0.111185458)\n",
181-
"Avg output tokens del: 740.764 ($0.001111146)\n",
182-
"N: 250\n",
183-
"Total cost: $98.039051\n"
169+
"Total cost: $9.200377000000001\n"
184170
]
185171
},
186172
{
187173
"data": {
188174
"text/plain": [
189-
"(98.039051, 250)"
175+
"(0, 0)"
190176
]
191177
},
192178
"execution_count": 4,

analysis/tp_parseable.csv

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
family,system,delivery rate,cs micro,cs macro,hard micro,hard macro,hard micro fix,hard macro fix,final
2+
openai,full,100.000,69.931,3.889,9.524,7.778,80.714,66.667,2.778
3+
openai,root-fc,100.000,66.250,0.556,21.905,14.444,49.524,31.667,0.000
4+
openai,baseline,99.444,66.389,0.556,23.333,13.333,52.143,37.778,0.000
5+
openai,small-leaf,100.000,68.264,0.556,2.143,1.111,74.762,55.000,0.556
6+
openai,small-all,99.444,55.000,0.000,0.000,0.000,88.095,73.333,0.000
7+
openai,small-baseline,91.111,49.861,0.000,0.238,0.000,53.333,35.000,0.000
8+
openai,short-context,99.444,61.667,1.667,7.857,6.667,,,1.111
9+
openai,short-baseline,98.889,64.861,2.222,25.476,13.889,,,0.556
10+
cohere-hf,full,84.444,52.153,1.111,1.190,1.111,,,0.000
11+
cohere-hf,root-fc,92.778,64.931,3.333,7.619,2.222,,,0.000
12+
cohere-hf,baseline,93.333,65.069,5.556,5.476,2.778,,,0.556
13+
cohere-hf,small-leaf,,,,,,,,
14+
cohere-hf,small-all,53.889,30.764,0.556,0.952,1.111,,,0.556
15+
cohere-hf,small-baseline,97.222,60.556,1.667,1.905,1.667,,,0.000
16+
cohere-hf,short-context,76.111,46.181,0.000,0.238,0.556,,,0.000
17+
cohere-hf,short-baseline,95.000,63.889,5.000,6.429,4.444,,,2.222
18+
glm,full,100.000,72.778,0.000,0.000,0.000,,,0.000
19+
glm,root-fc,100.000,65.417,6.111,28.571,25.000,,,2.778
20+
glm,baseline,100.000,67.917,8.889,39.048,27.778,,,5.556
21+
glm,small-leaf,,,,,,,,
22+
glm,small-all,,,,,,,,
23+
glm,small-baseline,,,,,,,,
24+
glm,short-context,100.000,72.778,0.000,0.000,0.000,,,0.000
25+
glm,short-baseline,100.000,56.250,5.556,17.619,12.222,,,2.778
26+
gpt-oss,full,100.000,57.569,0.000,0.000,0.000,,,0.000
27+
gpt-oss,root-fc,100.000,46.042,0.000,0.714,0.556,,,0.000
28+
gpt-oss,baseline,100.000,45.764,0.000,1.667,0.556,,,0.000
29+
gpt-oss,small-leaf,100.000,54.722,0.000,0.000,0.000,,,0.000
30+
gpt-oss,small-all,100.000,61.181,0.000,0.000,0.000,,,0.000
31+
gpt-oss,small-baseline,100.000,47.778,0.000,6.429,3.333,,,0.000
32+
gpt-oss,short-context,100.000,60.139,0.000,0.000,0.000,,,0.000
33+
gpt-oss,short-baseline,61.667,32.361,0.000,0.714,0.000,,,0.000
34+
qwen3,full,100.000,63.125,0.000,0.000,0.000,,,0.000
35+
qwen3,root-fc,100.000,57.083,3.889,17.381,7.222,,,2.222
36+
qwen3,baseline,100.000,58.958,1.111,15.238,6.111,,,0.556
37+
qwen3,small-leaf,100.000,63.056,0.000,0.000,0.000,,,0.000
38+
qwen3,small-all,100.000,55.764,0.000,0.000,0.000,,,0.000
39+
qwen3,small-baseline,100.000,56.597,0.000,5.476,3.333,,,0.000
40+
qwen3,short-context,100.000,62.500,0.000,0.000,0.000,,,0.000
41+
qwen3,short-baseline,100.000,56.944,0.000,11.667,5.556,,,0.000

0 commit comments

Comments
 (0)