44from collections import defaultdict
55from pathlib import Path
66
7+ import numpy as np
78import yaml
8- from tapeagents .core import Step , StepMetadata , Tape
9+ from tapeagents .core import Step , StepMetadata
910from tapeagents .renderers .camera_ready_renderer import CameraReadyRenderer
1011from tapeagents .tape_browser import TapeBrowser
1112
12- from agentlab .agents .tapeagent .agent import ExtendedMetadata
13+ from agentlab .agents .tapeagent .agent import ExtendedMetadata , Tape
1314
1415logger = logging .getLogger (__name__ )
1516fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
@@ -20,6 +21,10 @@ class WrapperStep(Step):
2021 content : dict
2122
2223
def pretty_yaml(data: dict) -> str:
    """Serialize ``data`` to a block-style YAML string for display.

    Args:
        data: Mapping to render. Any falsy value (an empty dict, or the
            ``None`` that callers pass when a key such as "result" is
            missing) is accepted and produces an empty string.

    Returns:
        The YAML text with keys in insertion order and a 2-space indent,
        or "" when there is nothing to render.
    """
    if not data:
        # Avoid emitting "{}" / "null" for empty payloads.
        return ""
    return yaml.dump(data, sort_keys=False, indent=2)
26+
27+
2328class TapesRender (CameraReadyRenderer ):
2429
2530 @property
@@ -31,36 +36,35 @@ def render_step(self, step: WrapperStep, index: int, **kwargs):
3136 step_dict = step .content .copy ()
3237 step_dict .pop ("metadata" , None )
3338 kind = step_dict .pop ("kind" , "Step" )
39+ if kind == "set_next_node" :
40+ return ""
3441 # remove empty keys
3542 step_dict = {k : v for k , v in step_dict .items () if v is not None and v != "" }
3643 if len (step_dict ) == 1 :
3744 content = list (step_dict .values ())[0 ]
3845 elif kind == "page_observation" :
39- content = step_dict [ "text" ]
46+ content = step_dict . get ( "text" , pretty_yaml ( step_dict ))
4047 if len (content ) > 100 :
4148 summary = content [:100 ]
4249 content = f"<details><summary>{ summary } </summary>---<br>{ content } </details>"
4350 elif kind == "python_code_action" :
44- content = step_dict [ "code" ]
51+ content = step_dict . get ( "code" , pretty_yaml ( step_dict ))
4552 elif kind == "code_execution_result" :
46- content = yaml . dump (step_dict [ "result" ], sort_keys = False , indent = 2 )
53+ content = pretty_yaml (step_dict . get ( "result" ) )
4754 else :
48- content = yaml . dump (step_dict , sort_keys = False , indent = 2 ) if step_dict else ""
55+ content = pretty_yaml (step_dict )
4956
50- if kind .endswith ("thought" ):
57+ if step_dict .get ("error" ) or step_dict .get ("result" , {}).get ("exit_code" ):
58+ class_ = "error"
59+ elif kind .endswith ("thought" ):
5160 class_ = "thought"
5261 kind = kind [:- 8 ]
5362 elif kind .endswith ("action" ):
5463 class_ = "action"
5564 kind = kind [:- 7 ]
5665 else :
5766 class_ = "observation"
58- return (
59- f"<div class='basic-renderer-box { class_ } '>"
60- f"<h4 class='step-header'>{ kind } </h4>"
61- f"<pre class='step-text'>{ content } </pre>"
62- f"</div>"
63- )
67+ return f"<div class='basic-renderer-box { class_ } '><h4 class='step-header'>{ kind } </h4><pre class='step-text'>{ content } </pre></div>"
6468
6569
6670class TapesBrowser (TapeBrowser ):
@@ -89,10 +93,21 @@ def get_context(self, tape: Tape) -> list:
8993 return []
9094
def get_tape_name(self, i: int, tape: Tape) -> str:
    """Build the label shown for ``tape`` in the tape selector.

    The label is ``"<Level>.<number> <marks><first 32 chars of step 0>..."``.

    Marks:
        * "✅ " when the tape's reward is positive,
        * "⚠ " (replacing the check mark) when any step carries an "error"
          value or a non-zero ``result["exit_code"]``,
        * "📁 " appended when the task has a "file_name".

    Args:
        i: Position of the tape in the list (unused here).
        tape: Tape whose ``steps`` hold dict ``content`` and whose
            ``metadata`` exposes ``reward`` and a ``task`` dict.

    Returns:
        The display name of the tape.
    """

    def step_failed(step) -> bool:
        # "result" may be absent, None, or a non-dict payload; only a dict
        # can carry an exit code.
        result = step.content.get("result")
        exit_code = result.get("exit_code") if isinstance(result, dict) else None
        return bool(step.content.get("error") or exit_code)

    task = tape.metadata.task
    mark = "✅ " if tape.metadata.reward > 0 else ""
    if any(step_failed(step) for step in tape.steps):
        # An error anywhere in the tape takes precedence over the success mark.
        mark = "⚠ "
    if task.get("file_name"):
        mark += "📁 "
    prefix = f"{task.get('Level', '')}.{task.get('number', '')}"
    # Tolerate empty tapes and first steps without a "content" field.
    first_content = tape.steps[0].content.get("content", "") if tape.steps else ""
    name = first_content[:32] + "..."
    return f"{prefix} {mark}{name}"
93108
94109 def get_exp_label (self , filename : str , tapes : list [Tape ]) -> str :
95- acc , n_solved = 0 , 0 # calculate_accuracy(tapes)
110+ acc , n_solved = self . calculate_accuracy (tapes )
96111 errors = defaultdict (int )
97112 prompt_tokens_num = 0
98113 output_tokens_num = 0
@@ -106,8 +121,10 @@ def get_exp_label(self, filename: str, tapes: list[Tape]) -> str:
106121 prompt_tokens_num += llm_call .prompt_length_tokens
107122 output_tokens_num += llm_call .output_length_tokens
108123 total_cost += llm_call .cost
124+ avg_steps = np .mean ([len (tape ) for tape in tapes ])
125+ std_steps = np .std ([len (tape ) for tape in tapes ])
109126 for tape in tapes :
110- if tape .metadata .result in [ "" , None , "None" ] :
127+ if not tape .metadata .terminated :
111128 no_result += 1
112129 if tape .metadata .error :
113130 errors ["fatal" ] += 1
@@ -125,9 +142,9 @@ def get_exp_label(self, filename: str, tapes: list[Tape]) -> str:
125142 if kind .endswith ("action" ):
126143 actions [kind ] += 1
127144 last_action = kind
128- if kind == "search_results_observation" and not len (step_dict [ "serp" ] ):
145+ if kind == "search_results_observation" and not len (step_dict . get ( "serp" ) ):
129146 errors ["search_empty" ] += 1
130- if kind == "page_observation" and step_dict [ "error" ] :
147+ if kind == "page_observation" and step_dict . get ( "error" ) :
131148 errors ["browser" ] += 1
132149 elif kind == "llm_output_parsing_failure_action" :
133150 errors ["parsing" ] += 1
@@ -136,13 +153,15 @@ def get_exp_label(self, filename: str, tapes: list[Tape]) -> str:
136153 errors [f"{ last_action } " ] += 1
137154 else :
138155 errors ["unknown_action_execution_failure" ] += 1
139- elif kind == "code_execution_result" and step_dict ["result" ]["exit_code" ]:
140- errors ["code_execution" ] += 1
156+ elif kind == "code_execution_result" :
157+ if step_dict .get ("result" , {}).get ("exit_code" ):
158+ errors ["code_execution" ] += 1
141159 timers , timer_counts = self .aggregate_timer_times (tapes )
142160 html = f"<h2>Solved { acc :.2f} %, { n_solved } out of { len (tapes )} </h2>"
143161 if "all" in filename :
144162 html += f"Prompt tokens: { prompt_tokens_num } <br>Output tokens: { output_tokens_num } <br>Cost: { total_cost :.2f} USD<h3>Visible</h3>"
145163 html += f"Prompt tokens: { visible_prompt_tokens_num } <br>Output tokens: { visible_output_tokens_num } <br>Cost: { visible_cost :.2f} USD"
164+ html += f"<h2>Steps per tape: { avg_steps :.1f} ± { std_steps :.1f} </h2>"
146165 if errors :
147166 errors_str = "<br>" .join (f"{ k } : { v } " for k , v in errors .items ())
148167 html += f"<h2>No result: { no_result } </h2>"
@@ -158,6 +177,11 @@ def get_exp_label(self, filename: str, tapes: list[Tape]) -> str:
158177 html += f"<h2>Timings</h2>{ timers_str } "
159178 return html
160179
def calculate_accuracy(self, tapes: list[Tape]) -> tuple[float, int]:
    """Compute the share of solved tapes.

    A tape counts as solved when ``tape.metadata.reward`` is positive, the
    same criterion used for the check mark in the tape list. A missing
    (``None``) reward counts as unsolved.

    Args:
        tapes: Tapes of one experiment.

    Returns:
        ``(accuracy, n_solved)`` where ``accuracy`` is a percentage in
        [0, 100] and ``n_solved`` is an integer count. An empty list
        yields ``(0.0, 0)``.
    """
    if not tapes:
        return 0.0, 0
    # Count instead of summing rewards so the result is a true int even
    # when rewards are stored as floats (otherwise "3.0 out of 4" is shown).
    n_solved = sum(1 for tape in tapes if (tape.metadata.reward or 0) > 0)
    return 100 * n_solved / len(tapes), n_solved
184+
161185 def aggregate_timer_times (self , tapes : list [Tape ]):
162186 timer_sums = defaultdict (float )
163187 timer_counts = defaultdict (int )
@@ -175,7 +199,7 @@ def aggregate_timer_times(self, tapes: list[Tape]):
175199 return dict (timer_sums ), dict (timer_counts )
176200
177201 def load_tapes (self , exp_dir : str ) -> list [dict ]:
178- tape_dicts = []
202+ tapes : list [ Tape ] = []
179203 fpath = Path (self .tapes_folder ) / exp_dir
180204 for json_file in fpath .rglob ("tape.json" ):
181205 if json_file .stat ().st_size == 0 :
@@ -189,11 +213,14 @@ def load_tapes(self, exp_dir: str) -> list[dict]:
189213 WrapperStep (content = s , metadata = StepMetadata (** s ["metadata" ]))
190214 for s in tape_dict ["steps" ]
191215 ]
192- tape_dicts .append (tape )
216+ tapes .append (tape )
193217 except Exception as e :
194218 logger .warning (f"Failed to load { json_file } : { e } " )
195- logger .info (f"Loaded { len (tape_dicts )} tapes from { exp_dir } " )
196- return tape_dicts
219+ logger .info (f"Loaded { len (tapes )} tapes from { exp_dir } " )
220+ return sorted (
221+ tapes ,
222+ key = lambda x : f"{ x .metadata .task .get ('Level' , '' )} { x .metadata .task .get ('number' , 0 ):03d} " ,
223+ )
197224
def save_annotation(self, step: int, annotation: str, tape_id: int):
    """Intentionally do nothing: annotations are not persisted by this browser.

    Args:
        step: Index of the annotated step (ignored).
        annotation: Annotation text (ignored).
        tape_id: Index of the annotated tape (ignored).
    """
    pass
0 commit comments