|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "58086537", |
7 | 7 | "metadata": {}, |
8 | 8 | "outputs": [], |
|
25 | 25 | }, |
26 | 26 | { |
27 | 27 | "cell_type": "code", |
28 | | - "execution_count": 5, |
29 | | - "id": "7901cccc", |
30 | | - "metadata": {}, |
31 | | - "outputs": [ |
32 | | - { |
33 | | - "data": { |
34 | | - "text/plain": [ |
35 | | - "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results')" |
36 | | - ] |
37 | | - }, |
38 | | - "execution_count": 5, |
39 | | - "metadata": {}, |
40 | | - "output_type": "execute_result" |
41 | | - } |
42 | | - ], |
43 | | - "source": [ |
44 | | - "RESULTS_DIR" |
45 | | - ] |
46 | | - }, |
47 | | - { |
48 | | - "cell_type": "code", |
49 | | - "execution_count": 6, |
| 28 | + "execution_count": null, |
50 | 29 | "id": "50be19a9", |
51 | 30 | "metadata": {}, |
52 | | - "outputs": [ |
53 | | - { |
54 | | - "name": "stdout", |
55 | | - "output_type": "stream", |
56 | | - "text": [ |
57 | | - "/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create\n" |
58 | | - ] |
59 | | - }, |
60 | | - { |
61 | | - "name": "stderr", |
62 | | - "output_type": "stream", |
63 | | - "text": [ |
64 | | - "Searching experiments directories.: 100%|██████████| 1/1 [00:00<00:00, 5433.04it/s]\n", |
65 | | - "Loading results: 100%|██████████| 1/1 [00:00<00:00, 373.26it/s]\n" |
66 | | - ] |
67 | | - } |
68 | | - ], |
| 31 | + "outputs": [], |
69 | 32 | "source": [ |
70 | 33 | "# replace this by your desired directory if needed.\n", |
71 | 34 | "result_dir = get_most_recent_study(RESULTS_DIR, contains=None)\n", |
|
76 | 39 | }, |
77 | 40 | { |
78 | 41 | "cell_type": "code", |
79 | | - "execution_count": 7, |
80 | | - "id": "82cc1557", |
81 | | - "metadata": {}, |
82 | | - "outputs": [ |
83 | | - { |
84 | | - "data": { |
85 | | - "text/plain": [ |
86 | | - "PosixPath('/Users/aman.jaiswal/Work/AgentLab.worktrees/trace-recorder/results/2025-09-02_15-52-00_hitl-genericagent-gpt-5-mini-2025-08-07-on-workarena-l1-task-name-create')" |
87 | | - ] |
88 | | - }, |
89 | | - "execution_count": 7, |
90 | | - "metadata": {}, |
91 | | - "output_type": "execute_result" |
92 | | - } |
93 | | - ], |
94 | | - "source": [ |
95 | | - "result_dir" |
96 | | - ] |
97 | | - }, |
98 | | - { |
99 | | - "cell_type": "code", |
100 | | - "execution_count": 3, |
| 42 | + "execution_count": null, |
101 | 43 | "id": "a424c470", |
102 | 44 | "metadata": {}, |
103 | | - "outputs": [ |
104 | | - { |
105 | | - "name": "stdout", |
106 | | - "output_type": "stream", |
107 | | - "text": [ |
108 | | - "Found multiple configuration, averaging across tasks and returning a per-agent report.\n" |
109 | | - ] |
110 | | - }, |
111 | | - { |
112 | | - "data": { |
113 | | - "text/html": [ |
114 | | - "<style type=\"text/css\">\n", |
115 | | - "#T_1d2fe th {\n", |
116 | | - " white-space: pre-wrap;\n", |
117 | | - "}\n", |
118 | | - "</style>\n", |
119 | | - "<table id=\"T_1d2fe\">\n", |
120 | | - " <thead>\n", |
121 | | - " <tr>\n", |
122 | | - " <th class=\"blank level0\" > </th>\n", |
123 | | - " <th id=\"T_1d2fe_level0_col0\" class=\"col_heading level0 col0\" >agent.agent\n", |
124 | | - "name</th>\n", |
125 | | - " <th id=\"T_1d2fe_level0_col1\" class=\"col_heading level0 col1\" >env.benchmark</th>\n", |
126 | | - " <th id=\"T_1d2fe_level0_col2\" class=\"col_heading level0 col2\" >avg\n", |
127 | | - "reward</th>\n", |
128 | | - " <th id=\"T_1d2fe_level0_col3\" class=\"col_heading level0 col3\" >std\n", |
129 | | - "err</th>\n", |
130 | | - " <th id=\"T_1d2fe_level0_col4\" class=\"col_heading level0 col4\" >avg\n", |
131 | | - "steps</th>\n", |
132 | | - " <th id=\"T_1d2fe_level0_col5\" class=\"col_heading level0 col5\" >n\n", |
133 | | - "completed</th>\n", |
134 | | - " <th id=\"T_1d2fe_level0_col6\" class=\"col_heading level0 col6\" >n\n", |
135 | | - "err</th>\n", |
136 | | - " </tr>\n", |
137 | | - " </thead>\n", |
138 | | - " <tbody>\n", |
139 | | - " <tr>\n", |
140 | | - " <th id=\"T_1d2fe_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n", |
141 | | - " <td id=\"T_1d2fe_row0_col0\" class=\"data row0 col0\" >HITL-GenericAgent-gpt-5-mini-2025-08-07</td>\n", |
142 | | - " <td id=\"T_1d2fe_row0_col1\" class=\"data row0 col1\" >workarena</td>\n", |
143 | | - " <td id=\"T_1d2fe_row0_col2\" class=\"data row0 col2\" >nan</td>\n", |
144 | | - " <td id=\"T_1d2fe_row0_col3\" class=\"data row0 col3\" >nan</td>\n", |
145 | | - " <td id=\"T_1d2fe_row0_col4\" class=\"data row0 col4\" >nan</td>\n", |
146 | | - " <td id=\"T_1d2fe_row0_col5\" class=\"data row0 col5\" >0/1</td>\n", |
147 | | - " <td id=\"T_1d2fe_row0_col6\" class=\"data row0 col6\" >0</td>\n", |
148 | | - " </tr>\n", |
149 | | - " </tbody>\n", |
150 | | - "</table>\n" |
151 | | - ], |
152 | | - "text/plain": [ |
153 | | - "<pandas.io.formats.style.Styler at 0x125c55850>" |
154 | | - ] |
155 | | - }, |
156 | | - "metadata": {}, |
157 | | - "output_type": "display_data" |
158 | | - } |
159 | | - ], |
| 45 | + "outputs": [], |
160 | 46 | "source": [ |
161 | 47 | "report = inspect_results.global_report(result_df)\n", |
162 | 48 | "inspect_results.display_report(report)" |
163 | 49 | ] |
164 | 50 | }, |
165 | | - { |
166 | | - "cell_type": "code", |
167 | | - "execution_count": null, |
168 | | - "id": "f86e44fd", |
169 | | - "metadata": {}, |
170 | | - "outputs": [ |
171 | | - { |
172 | | - "data": { |
173 | | - "application/vnd.microsoft.datawrangler.viewer.v0+json": { |
174 | | - "columns": [ |
175 | | - { |
176 | | - "name": "('agent.agent_name', 'env.benchmark')", |
177 | | - "rawType": "object", |
178 | | - "type": "unknown" |
179 | | - }, |
180 | | - { |
181 | | - "name": "avg_reward", |
182 | | - "rawType": "float64", |
183 | | - "type": "float" |
184 | | - }, |
185 | | - { |
186 | | - "name": "std_err", |
187 | | - "rawType": "float64", |
188 | | - "type": "float" |
189 | | - }, |
190 | | - { |
191 | | - "name": "avg_steps", |
192 | | - "rawType": "float64", |
193 | | - "type": "float" |
194 | | - }, |
195 | | - { |
196 | | - "name": "n_completed", |
197 | | - "rawType": "object", |
198 | | - "type": "string" |
199 | | - }, |
200 | | - { |
201 | | - "name": "n_err", |
202 | | - "rawType": "int64", |
203 | | - "type": "integer" |
204 | | - } |
205 | | - ], |
206 | | - "ref": "ea68795e-a1d8-404e-9e36-1061d8fa9e87", |
207 | | - "rows": [ |
208 | | - [ |
209 | | - "('HITL-GenericAgent-gpt-5-mini-2025-08-07', 'workarena')", |
210 | | - null, |
211 | | - null, |
212 | | - null, |
213 | | - "0/1", |
214 | | - "0" |
215 | | - ] |
216 | | - ], |
217 | | - "shape": { |
218 | | - "columns": 5, |
219 | | - "rows": 1 |
220 | | - } |
221 | | - }, |
222 | | - "text/html": [ |
223 | | - "<div>\n", |
224 | | - "<style scoped>\n", |
225 | | - " .dataframe tbody tr th:only-of-type {\n", |
226 | | - " vertical-align: middle;\n", |
227 | | - " }\n", |
228 | | - "\n", |
229 | | - " .dataframe tbody tr th {\n", |
230 | | - " vertical-align: top;\n", |
231 | | - " }\n", |
232 | | - "\n", |
233 | | - " .dataframe thead th {\n", |
234 | | - " text-align: right;\n", |
235 | | - " }\n", |
236 | | - "</style>\n", |
237 | | - "<table border=\"1\" class=\"dataframe\">\n", |
238 | | - " <thead>\n", |
239 | | - " <tr style=\"text-align: right;\">\n", |
240 | | - " <th></th>\n", |
241 | | - " <th></th>\n", |
242 | | - " <th>avg_reward</th>\n", |
243 | | - " <th>std_err</th>\n", |
244 | | - " <th>avg_steps</th>\n", |
245 | | - " <th>n_completed</th>\n", |
246 | | - " <th>n_err</th>\n", |
247 | | - " </tr>\n", |
248 | | - " <tr>\n", |
249 | | - " <th>agent.agent_name</th>\n", |
250 | | - " <th>env.benchmark</th>\n", |
251 | | - " <th></th>\n", |
252 | | - " <th></th>\n", |
253 | | - " <th></th>\n", |
254 | | - " <th></th>\n", |
255 | | - " <th></th>\n", |
256 | | - " </tr>\n", |
257 | | - " </thead>\n", |
258 | | - " <tbody>\n", |
259 | | - " <tr>\n", |
260 | | - " <th>HITL-GenericAgent-gpt-5-mini-2025-08-07</th>\n", |
261 | | - " <th>workarena</th>\n", |
262 | | - " <td>NaN</td>\n", |
263 | | - " <td>NaN</td>\n", |
264 | | - " <td>NaN</td>\n", |
265 | | - " <td>0/1</td>\n", |
266 | | - " <td>0</td>\n", |
267 | | - " </tr>\n", |
268 | | - " </tbody>\n", |
269 | | - "</table>\n", |
270 | | - "</div>" |
271 | | - ], |
272 | | - "text/plain": [ |
273 | | - " avg_reward std_err \\\n", |
274 | | - "agent.agent_name env.benchmark \n", |
275 | | - "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena NaN NaN \n", |
276 | | - "\n", |
277 | | - " avg_steps n_completed \\\n", |
278 | | - "agent.agent_name env.benchmark \n", |
279 | | - "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena NaN 0/1 \n", |
280 | | - "\n", |
281 | | - " n_err \n", |
282 | | - "agent.agent_name env.benchmark \n", |
283 | | - "HITL-GenericAgent-gpt-5-mini-2025-08-07 workarena 0 " |
284 | | - ] |
285 | | - }, |
286 | | - "execution_count": 4, |
287 | | - "metadata": {}, |
288 | | - "output_type": "execute_result" |
289 | | - } |
290 | | - ], |
291 | | - "source": [ |
292 | | - "\n" |
293 | | - ] |
294 | | - }, |
295 | 51 | { |
296 | 52 | "cell_type": "markdown", |
297 | 53 | "id": "385559d7", |
|
393 | 149 | ], |
394 | 150 | "metadata": { |
395 | 151 | "kernelspec": { |
396 | | - "display_name": "agentlab", |
| 152 | + "display_name": "AgentLab", |
397 | 153 | "language": "python", |
398 | 154 | "name": "python3" |
399 | 155 | }, |
|
407 | 163 | "name": "python", |
408 | 164 | "nbconvert_exporter": "python", |
409 | 165 | "pygments_lexer": "ipython3", |
410 | | - "version": "3.12.9" |
| 166 | + "version": "3.12.7" |
411 | 167 | } |
412 | 168 | }, |
413 | 169 | "nbformat": 4, |
|
0 commit comments