Skip to content

Commit 13eec41

Browse files
committed
tapes browser ui
1 parent e6ebfd8 commit 13eec41

File tree

2 files changed

+216
-1
lines changed

2 files changed

+216
-1
lines changed

src/agentlab/agents/tapeagent/agent.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,16 @@
1313
logger.setLevel(logging.INFO)
1414

1515

16+
class ExtendedMetadata(TapeMetadata):
    """Tape metadata carrying extra per-run fields on top of ``TapeMetadata``.

    Every field has a default, so an instance can be built from just the
    base-class arguments (e.g. ``author``).
    """

    # NOTE(review): field meanings below are inferred from their names only;
    # confirm against the code that fills them in.
    name: str = ""
    # NOTE(review): ``{}`` defaults assume TapeMetadata is a pydantic model
    # (which copies mutable defaults per instance) - TODO confirm.
    task: dict = {}
    terminated: bool = False
    truncated: bool = False
    reward: float = 0.0
    attempt_number: int = 0
    # Free-form extras; the tapes browser looks for a "timers" entry in here.
    other: dict = {}
24+
25+
1626
@dataclass
1727
class TapeAgentArgs(AgentArgs):
1828
agent_name: str
@@ -77,5 +87,5 @@ def get_action(self, obs: Observation | list[Observation]) -> tuple[str, TapeAge
7787

7888
@property
def final_tape(self) -> Tape:
    """Return ``self.tape`` after assigning it freshly built metadata.

    Each access replaces ``self.tape.metadata`` with a new
    ``ExtendedMetadata`` whose ``author`` is ``self.agent.name``.
    """
    # NOTE(review): this getter mutates the tape on every access, so any
    # metadata set on it earlier is discarded - confirm that is intended.
    fresh_metadata = ExtendedMetadata(author=self.agent.name)
    self.tape.metadata = fresh_metadata
    return self.tape

src/agentlab/analyze/tapes.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
import json
2+
import logging
3+
import sys
4+
from collections import defaultdict
5+
from pathlib import Path
6+
7+
import yaml
8+
from tapeagents.core import Step, StepMetadata, Tape
9+
from tapeagents.renderers.camera_ready_renderer import CameraReadyRenderer
10+
from tapeagents.tape_browser import TapeBrowser
11+
12+
from agentlab.agents.tapeagent.agent import ExtendedMetadata
13+
14+
# Module-level logger plus root logging configuration.
# NOTE: basicConfig(force=True) runs at import time and replaces any handlers
# already attached to the root logger.
logger = logging.getLogger(__name__)
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
logging.basicConfig(
    format=fmt,
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
    force=True,
)
17+
18+
19+
class WrapperStep(Step):
    """Step that keeps the original step dict untouched in ``content``.

    The browser and renderer in this module read everything (including the
    ``kind`` entry) straight from that dict.
    """

    # Raw step dict as loaded from the tape's JSON file.
    content: dict
21+
22+
23+
class TapesRender(CameraReadyRenderer):
    """Render steps whose payload is a raw dict (``WrapperStep.content``) as HTML boxes.

    The dict's ``kind`` entry selects both the layout of the body and the CSS
    class (``thought`` / ``action`` / ``observation``) of the box.
    """

    @property
    def style(self):
        """Return the parent stylesheet followed by a highlight rule for thoughts."""
        # This is a plain string, not an f-string: braces must be single.
        # Doubled braces would be emitted literally and invalidate the rule.
        style = "<style>.thought { background-color: #ffffba !important; }</style>"
        return super().style + style

    def render_step(self, step: WrapperStep, index: int, **kwargs):
        """Build the HTML box for one step.

        Args:
            step: step whose ``content`` dict holds the fields to display.
            index: position of the step; not used by this renderer.
            **kwargs: accepted for signature compatibility; ignored.

        Returns:
            An HTML string: a ``div`` with a header (the step kind without its
            ``_thought`` / ``_action`` suffix) and a ``pre`` holding the body.
        """
        import html  # local import keeps this block self-contained

        def esc(value) -> str:
            # Step fields can contain arbitrary text (e.g. fetched page
            # content); escape it so it is shown verbatim, not parsed as markup.
            return html.escape(str(value), quote=False)

        step_dict = step.content.copy()
        step_dict.pop("metadata", None)
        kind = step_dict.pop("kind", "Step")
        # Drop empty fields. Because of this, any field read below may be
        # absent, hence the ``in step_dict`` guards on the kind-specific branches.
        step_dict = {k: v for k, v in step_dict.items() if v is not None and v != ""}

        if len(step_dict) == 1:
            # A single remaining field is shown bare, without its key.
            content = esc(next(iter(step_dict.values())))
        elif kind == "page_observation" and "text" in step_dict:
            text = str(step_dict["text"])
            content = esc(text)
            if len(text) > 100:
                # Truncate before escaping so an entity is never cut in half.
                summary = esc(text[:100])
                content = f"<details><summary>{summary}</summary>---<br>{content}</details>"
        elif kind == "python_code_action" and "code" in step_dict:
            content = esc(step_dict["code"])
        elif kind == "code_execution_result" and "result" in step_dict:
            content = esc(yaml.dump(step_dict["result"], sort_keys=False, indent=2))
        else:
            # Generic fallback, also used when a kind-specific field is missing.
            content = esc(yaml.dump(step_dict, sort_keys=False, indent=2)) if step_dict else ""

        if kind.endswith("thought"):
            class_ = "thought"
            kind = kind[:-8]  # strip "thought" plus the separator before it
        elif kind.endswith("action"):
            class_ = "action"
            kind = kind[:-7]  # strip "action" plus the separator before it
        else:
            class_ = "observation"
        return (
            f"<div class='basic-renderer-box {class_}'>"
            f"<h4 class='step-header'>{esc(kind)}</h4>"
            f"<pre class='step-text'>{content}</pre>"
            f"</div>"
        )
64+
65+
66+
class TapesBrowser(TapeBrowser):
    """Tape browser over a results folder containing ``tape.json`` files.

    Every immediate sub-directory of ``tapes_folder`` that holds at least one
    ``tape.json`` (at any depth) is offered as an experiment; selecting it
    loads all tapes found below it.
    """

    def __init__(self, tapes_folder):
        """Initialise the base browser with ``Tape``, ``tapes_folder``, a ``TapesRender`` and ``".json"``."""
        super().__init__(Tape, tapes_folder, TapesRender(), ".json")

    def get_tape_files(self) -> list[str]:
        """Return the sorted names of experiment directories that contain tapes.

        Raises:
            AssertionError: if no sub-directory contains a ``tape.json``.
        """
        logger.info(f"Searching for tapes in {self.tapes_folder}")
        fpath = Path(self.tapes_folder)
        exps = [
            str(exp_dir.relative_to(fpath))
            for exp_dir in fpath.iterdir()
            # next() stops at the first match instead of walking the whole tree.
            if exp_dir.is_dir() and next(exp_dir.rglob("tape.json"), None) is not None
        ]
        assert exps, f"No experiments found in {self.tapes_folder}"
        logger.info(f"Found {len(exps)} experiments in {self.tapes_folder}")
        return sorted(exps)

    def get_steps(self, tape) -> list:
        """Return the steps of ``tape``, given either a raw dict or a ``Tape`` object."""
        # load_tapes() produces Tape objects, which expose steps as an
        # attribute; raw dicts are still accepted for backward compatibility.
        if isinstance(tape, dict):
            return tape["steps"]
        return tape.steps

    def load_llm_calls(self):
        """No-op override: no LLM calls are loaded here."""
        pass

    def get_context(self, tape: Tape) -> list:
        """Return an empty list: no context steps are provided for these tapes."""
        return []

    def get_tape_name(self, i: int, tape: Tape) -> str:
        """Return a short display name for ``tape``.

        The name is the first 32 characters of the first step's ``"content"``
        field followed by ``"..."``. If that field is unavailable (empty tape,
        no such key), fall back to the metadata ``name`` or ``"tape <i>"``.
        """
        try:
            first_content = tape[0].content["content"]
        except (IndexError, KeyError, TypeError, AttributeError):
            fallback = getattr(getattr(tape, "metadata", None), "name", "")
            return fallback or f"tape {i}"
        return str(first_content)[:32] + "..."

    def get_exp_label(self, filename: str, tapes: list[Tape]) -> str:
        """Build the HTML summary shown for an experiment.

        Reports token/cost totals, error counts by category, action counts by
        kind and aggregated timings.

        Args:
            filename: experiment name; overall totals are shown only when it
                contains ``"all"``.
            tapes: tapes loaded for the experiment.

        Returns:
            An HTML fragment.
        """
        # NOTE(review): accuracy is not computed yet, so the header always
        # reports 0 solved.
        acc, n_solved = 0, 0  # calculate_accuracy(tapes)
        errors = defaultdict(int)
        prompt_tokens_num = 0
        output_tokens_num = 0
        total_cost = 0.0
        visible_prompt_tokens_num = 0
        visible_output_tokens_num = 0
        visible_cost = 0.0
        no_result = 0
        actions = defaultdict(int)
        for llm_call in self.llm_calls.values():
            prompt_tokens_num += llm_call.prompt_length_tokens
            output_tokens_num += llm_call.output_length_tokens
            total_cost += llm_call.cost
        for tape in tapes:
            if tape.metadata.result in ["", None, "None"]:
                no_result += 1
            if tape.metadata.error:
                errors["fatal"] += 1
            last_action = None
            counted = set()  # prompt ids already accounted for in this tape
            for step in tape:
                step_dict = step.content
                kind = step_dict.get("kind", "unknown")
                prompt_id = step.metadata.prompt_id
                llm_call = self.llm_calls.get(prompt_id)
                if llm_call and prompt_id not in counted:
                    counted.add(prompt_id)
                    visible_prompt_tokens_num += llm_call.prompt_length_tokens
                    visible_output_tokens_num += llm_call.output_length_tokens
                    visible_cost += llm_call.cost
                if kind.endswith("action"):
                    actions[kind] += 1
                    last_action = kind
                # Fields are read with .get(): a step lacking an optional key
                # must not abort the whole summary with a KeyError.
                if kind == "search_results_observation" and not step_dict.get("serp"):
                    errors["search_empty"] += 1
                if kind == "page_observation" and step_dict.get("error"):
                    errors["browser"] += 1
                elif kind == "llm_output_parsing_failure_action":
                    errors["parsing"] += 1
                elif kind == "action_execution_failure":
                    # Attribute the failure to the most recent action kind.
                    if last_action:
                        errors[f"{last_action}"] += 1
                    else:
                        errors["unknown_action_execution_failure"] += 1
                elif kind == "code_execution_result":
                    result = step_dict.get("result")
                    if isinstance(result, dict) and result.get("exit_code"):
                        errors["code_execution"] += 1
        timers, timer_counts = self.aggregate_timer_times(tapes)
        html = f"<h2>Solved {acc:.2f}%, {n_solved} out of {len(tapes)}</h2>"
        if "all" in filename:
            html += f"Prompt tokens: {prompt_tokens_num}<br>Output tokens: {output_tokens_num}<br>Cost: {total_cost:.2f} USD<h3>Visible</h3>"
        html += f"Prompt tokens: {visible_prompt_tokens_num}<br>Output tokens: {visible_output_tokens_num}<br>Cost: {visible_cost:.2f} USD"
        if errors:
            errors_str = "<br>".join(f"{k}: {v}" for k, v in errors.items())
            html += f"<h2>No result: {no_result}</h2>"
            html += f"<h2>Errors: {sum(errors.values())}</h2>{errors_str}"
        if actions:
            actions_str = "<br>".join(f"{k}: {v}" for k, v in actions.items())
            html += f"<h2>Actions: {sum(actions.values())}</h2>{actions_str}"
        if timers:
            timers_str = "<br>".join(
                f"{'execute ' if k.endswith('action') else ''}{k}: {v:.1f} sec, avg. {v/timer_counts[k]:.1f} sec"
                for k, v in timers.items()
            )
            html += f"<h2>Timings</h2>{timers_str}"
        return html

    def aggregate_timer_times(self, tapes: list[Tape]):
        """Sum timer values over all tapes and their steps.

        Tape-level timers come from ``tape.metadata.other["timers"]``;
        step-level ones from ``step.metadata.other`` (``action_kind`` and
        ``action_execution_time``).

        Returns:
            A ``(sums, counts)`` pair of dicts keyed by timer name or action
            kind: total time and number of samples.
        """
        timer_sums = defaultdict(float)
        timer_counts = defaultdict(int)
        for tape in tapes:
            timers = tape.metadata.other.get("timers", {})
            for timer_name, exec_time in timers.items():
                timer_sums[timer_name] += exec_time
                timer_counts[timer_name] += 1
            for step in tape.steps:
                action_kind = step.metadata.other.get("action_kind")
                action_execution_time = step.metadata.other.get("action_execution_time")
                # Steps with no recorded (or zero) execution time are skipped.
                if action_kind and action_execution_time:
                    timer_sums[action_kind] += action_execution_time
                    timer_counts[action_kind] += 1
        return dict(timer_sums), dict(timer_counts)

    def load_tapes(self, exp_dir: str) -> list[Tape]:
        """Load every ``tape.json`` found under ``exp_dir``.

        Empty or unreadable files are logged and skipped so that one bad tape
        does not hide the rest of the experiment.

        Args:
            exp_dir: experiment directory, relative to ``self.tapes_folder``.

        Returns:
            The loaded tapes, each with ``ExtendedMetadata`` and its steps
            wrapped as ``WrapperStep``.
        """
        loaded_tapes = []
        fpath = Path(self.tapes_folder) / exp_dir
        # Sorted so tape indices are stable between runs; rglob order is
        # whatever the filesystem yields.
        for json_file in sorted(fpath.rglob("tape.json")):
            if json_file.stat().st_size == 0:
                logger.warning(f"Empty tape file: {json_file}")
                continue
            try:
                with open(json_file, encoding="utf-8") as f:
                    tape_dict = json.load(f)
                tape = Tape(steps=[], metadata=ExtendedMetadata(**tape_dict["metadata"]))
                tape.steps = [
                    WrapperStep(content=s, metadata=StepMetadata(**s["metadata"]))
                    for s in tape_dict["steps"]
                ]
                loaded_tapes.append(tape)
            except Exception as e:
                # Deliberate best-effort: report the file and keep going.
                logger.warning(f"Failed to load {json_file}: {e}")
        logger.info(f"Loaded {len(loaded_tapes)} tapes from {exp_dir}")
        return loaded_tapes

    def save_annotation(self, step: int, annotation: str, tape_id: int):
        """No-op override: annotations are not saved."""
        pass
200+
201+
202+
if __name__ == "__main__":
    # The results folder is the first CLI argument when given, otherwise the
    # default under the user's home directory ("~" is expanded below).
    cli_args = sys.argv[1:]
    results_dir = cli_args[0] if cli_args else "~/agentlab_results/"
    tapes_browser = TapesBrowser(Path(results_dir).expanduser())
    tapes_browser.launch()

0 commit comments

Comments
 (0)