Skip to content

Commit 7387922

Browse files
committed
Merge branch 'main' into osworld
2 parents d7401bf + c32400f commit 7387922

File tree

15 files changed

+534
-69
lines changed

15 files changed

+534
-69
lines changed

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,5 +177,4 @@ OSWorld/
177177

178178

179179
# working files
180-
main_miniwob_debug.py
181-
main_workarena_debug.py
180+
experiments/*

src/agentlab/agents/generic_agent/__init__.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,23 @@
99
from .agent_configs import (
1010
AGENT_3_5,
1111
AGENT_8B,
12+
AGENT_37_SONNET,
13+
AGENT_CLAUDE_SONNET_35,
14+
AGENT_CLAUDE_SONNET_35_VISION,
1215
AGENT_CUSTOM,
13-
AGENT_LLAMA4_17B_INSTRUCT,
1416
AGENT_LLAMA3_70B,
17+
AGENT_LLAMA4_17B_INSTRUCT,
1518
AGENT_LLAMA31_70B,
19+
CHAT_MODEL_ARGS_DICT,
1620
RANDOM_SEARCH_AGENT,
1721
AGENT_4o,
1822
AGENT_4o_MINI,
19-
AGENT_CLAUDE_SONNET_35,
20-
AGENT_37_SONNET,
21-
AGENT_CLAUDE_SONNET_35_VISION,
22-
AGENT_4o_VISION,
2323
AGENT_4o_MINI_VISION,
24-
AGENT_o3_MINI,
24+
AGENT_4o_VISION,
2525
AGENT_o1_MINI,
26+
AGENT_o3_MINI,
27+
FLAGS_GPT_4o,
28+
GenericAgentArgs,
2629
)
2730

2831
__all__ = [
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import sys
22

3+
from agentlab.agents.tool_use_agent.tool_use_agent import *
4+
35
# for backward compatibility of unpickling
46
sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__]

src/agentlab/agents/tool_use_agent/tool_use_agent.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
150150

151151
AXTREE_NOTE = """
152152
AXTree extracts most of the interactive elements of the DOM in a tree structure. It may also contain information that is not visible in the screenshot.
153-
A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools.
153+
A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools, e.g, click(bid="a253"). Make sure to include letters and numbers in the bid.
154154
"""
155155

156156

@@ -347,7 +347,7 @@ class PromptConfig:
347347
task_hint: TaskHint = None
348348
keep_last_n_obs: int = 1
349349
multiaction: bool = False
350-
action_subsets: tuple[str] = field(default_factory=lambda: ("coord",))
350+
action_subsets: tuple[str] = None
351351

352352

353353
@dataclass
@@ -512,6 +512,15 @@ def get_action(self, obs: Any) -> float:
512512
vision_support=True,
513513
)
514514

515+
GPT_4_1_MINI = OpenAIResponseModelArgs(
516+
model_name="gpt-4.1-mini",
517+
max_total_tokens=200_000,
518+
max_input_tokens=200_000,
519+
max_new_tokens=2_000,
520+
temperature=0.1,
521+
vision_support=True,
522+
)
523+
515524
OPENAI_CHATAPI_MODEL_CONFIG = OpenAIChatModelArgs(
516525
model_name="gpt-4o-2024-08-06",
517526
max_total_tokens=200_000,

src/agentlab/analyze/agent_xray.py

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ class StepId:
8383
@dataclass
8484
class Info:
8585
results_dir: Path = None # to root directory of all experiments
86-
exp_list_dir: Path = None # the path of the currently selected experiment
86+
study_dirs: Path = None # the directories of the currently selected studies (assigned as a list of Paths)
8787
result_df: pd.DataFrame = None # the raw loaded df
8888
agent_df: pd.DataFrame = None # the df filtered for selected agent
8989
tasks_df: pd.DataFrame = None # the unique tasks for selected agent
@@ -178,6 +178,8 @@ def run_gradio(results_dir: Path):
178178
agent_task_id = gr.State(value=None)
179179
step_id = gr.State(value=None)
180180

181+
hidden_key_input = gr.Textbox(visible=False, elem_id="key_capture")
182+
181183
with gr.Accordion("Help", open=False):
182184
gr.Markdown(
183185
"""\
@@ -207,6 +209,7 @@ def run_gradio(results_dir: Path):
207209
exp_dir_choice = gr.Dropdown(
208210
choices=get_directory_contents(results_dir),
209211
value=select_dir_instructions,
212+
multiselect=True,
210213
label="Experiment Directory",
211214
show_label=False,
212215
scale=6,
@@ -502,6 +505,32 @@ def run_gradio(results_dir: Path):
502505

503506
demo.load(fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice)
504507

508+
demo.load(
509+
None,
510+
None,
511+
None,
512+
js="""
513+
function() {
514+
document.addEventListener('keydown', function(e) {
515+
if ((e.key === 'ArrowLeft' || e.key === 'ArrowRight') && (e.metaKey || e.ctrlKey)) {
516+
e.preventDefault();
517+
const hiddenInput = document.querySelector('#key_capture input, #key_capture textarea');
518+
if (hiddenInput) {
519+
let event = e.key === 'ArrowLeft' ? 'Cmd+Left' : 'Cmd+Right';
520+
hiddenInput.value = event;
521+
hiddenInput.dispatchEvent(new Event('input', {bubbles: true}));
522+
}
523+
}
524+
});
525+
}
526+
""",
527+
)
528+
hidden_key_input.change(
529+
handle_key_event,
530+
inputs=[hidden_key_input, step_id],
531+
outputs=[hidden_key_input, step_id],
532+
)
533+
505534
demo.queue()
506535

507536
do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true"
@@ -511,6 +540,25 @@ def run_gradio(results_dir: Path):
511540
demo.launch(server_port=port, share=do_share)
512541

513542

543+
def handle_key_event(key_event, step_id: StepId):
    """Move the selected step one position back or forward on a keyboard shortcut.

    Wired to the hidden key-capture textbox, whose value is set by the page's
    keydown listener to "Cmd+Left" or "Cmd+Right".

    Args:
        key_event: Current text of the hidden textbox; empty/None when there
            is nothing to handle.
        step_id: The currently selected step, or None when no step has been
            selected yet (the state is initialised with None).

    Returns:
        A 2-tuple matching the listener's outputs ``[hidden_key_input, step_id]``:
        the new textbox value and the (possibly updated) step id.
    """
    global info

    if not key_event or step_id is None:
        # Nothing to handle, or no step to move from: clear the textbox and
        # hand the state back unchanged.
        return "", step_id

    step = step_id.step
    if key_event.startswith("Cmd+Left"):
        step = max(0, step - 1)
    elif key_event.startswith("Cmd+Right"):
        # The upper bound (len - 2) is kept from the original code; the outer
        # max() keeps the index non-negative for episodes with fewer than two
        # recorded steps.
        last_reachable = len(info.exp_result.steps_info) - 2
        step = max(0, min(last_reachable, step + 1))
    else:
        # Unknown key: leave the textbox alone and keep the same state, while
        # still returning one value per declared output.
        return gr.update(), step_id

    info.step = step
    return "", StepId(episode_id=step_id.episode_id, step=step)
560+
561+
514562
def tab_select(evt: gr.SelectData):
515563
global info
516564
info.active_tab = evt.value
@@ -546,18 +594,24 @@ def get_screenshot(
546594
):
547595
if step is None:
548596
step = info.step
549-
step_info = info.exp_result.steps_info[step]
550597
try:
598+
step_info = info.exp_result.steps_info[step]
551599
is_som = som_or_not == "SOM Screenshots"
552600
img = info.exp_result.get_screenshot(step, som=is_som)
553601
if annotate:
554602
action_str = step_info.action
555603
properties = step_info.obs.get("extra_element_properties", None)
556-
action_colored = annotate_action(img, action_string=action_str, properties=properties)
604+
try:
605+
action_colored = annotate_action(
606+
img, action_string=action_str, properties=properties
607+
)
608+
except Exception as e:
609+
warning(f"Failed to annotate action: {e}")
610+
action_colored = action_str
557611
else:
558612
action_colored = None
559613
return img, action_colored
560-
except FileNotFoundError:
614+
except (FileNotFoundError, IndexError):
561615
return None, None
562616

563617

@@ -839,6 +893,10 @@ def get_episode_info(info: Info):
839893
try:
840894
env_args = info.exp_result.exp_args.env_args
841895
steps_info = info.exp_result.steps_info
896+
if info.step >= len(steps_info):
897+
info.step = len(steps_info) - 1
898+
if len(steps_info) == 0:
899+
return "No steps were taken in this episode."
842900
step_info = steps_info[info.step]
843901
try:
844902
goal = step_info.obs["goal_object"]
@@ -1040,31 +1098,29 @@ def update_global_stats():
10401098

10411099

10421100
def update_error_report():
1043-
report_files = list(info.exp_list_dir.glob("error_report*.md"))
1044-
if len(report_files) == 0:
1045-
return "No error report found"
1046-
report_files = sorted(report_files, key=os.path.getctime, reverse=True)
1047-
return report_files[0].read_text()
1101+
return inspect_results.error_report(info.result_df, max_stack_trace=3, use_log=True)
10481102

10491103

1050-
def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
1051-
if exp_dir == select_dir_instructions:
1052-
return None, None
1104+
def new_exp_dir(study_names: list, progress=gr.Progress(), just_refresh=False):
1105+
global info
10531106

1054-
exp_dir = exp_dir.split(" - ")[0]
1107+
# remove select_dir_instructions from study_names
1108+
if select_dir_instructions in study_names:
1109+
study_names.remove(select_dir_instructions)
10551110

1056-
if len(exp_dir) == 0:
1057-
info.exp_list_dir = None
1111+
if len(study_names) == 0:
10581112
return None, None
10591113

1060-
info.exp_list_dir = info.results_dir / exp_dir
1061-
info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm)
1114+
info.study_dirs = [info.results_dir / study_name.split(" - ")[0] for study_name in study_names]
1115+
info.result_df = inspect_results.load_result_df(info.study_dirs, progress_fn=progress.tqdm)
10621116
info.result_df = remove_args_from_col(info.result_df)
10631117

10641118
study_summary = inspect_results.summarize_study(info.result_df)
10651119
# save study_summary
1066-
study_summary.to_csv(info.exp_list_dir / "summary_df.csv", index=False)
1067-
agent_report = display_table(study_summary)
1120+
1121+
for study_dir in info.study_dirs:
1122+
study_summary.to_csv(study_dir / "summary_df.csv", index=False)
1123+
agent_report = display_table(study_summary)
10681124

10691125
info.agent_id_keys = agent_report.index.names
10701126
agent_report.reset_index(inplace=True)
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import os
2+
from dataclasses import dataclass
3+
from pathlib import Path
4+
5+
import pandas as pd
6+
from tqdm import tqdm
7+
8+
from agentlab.analyze import inspect_results
9+
from agentlab.experiments.exp_utils import RESULTS_DIR
10+
from agentlab.experiments.study import Study
11+
12+
13+
@dataclass
class StudyInfo:
    """Archiving verdict for one study directory.

    Instances are built by ``search_for_reasons_to_archive`` and consumed by
    the script entry point of this module.
    """

    # Directory of the study on disk.
    study_dir: Path
    # Loaded study object; the scanner stores None when loading failed.
    study: Study
    # Summary table of the study (read from a summary CSV or recomputed).
    summary_df: pd.DataFrame
    # True once at least one archiving criterion matched.
    should_delete: bool = False
    # Human-readable explanation of the matched criteria.
    reason: str = ""
20+
21+
22+
def _sum_summary_counts(summary_df: pd.DataFrame) -> tuple[int, int, int]:
    """Add up the task counters over all rows of a study summary.

    Each row's "n_completed" cell is a "<completed>/<total>" string; "n_err"
    is an integer count and is treated as 0 when missing or NaN.

    Returns:
        ``(n_completed, n_total, n_err)`` summed over all rows; all zeros for
        an empty frame.
    """
    n_completed, n_total, n_err = 0, 0, 0
    for _, row in summary_df.iterrows():
        n_comp, n_tot = row["n_completed"].split("/")
        n_completed += int(n_comp)
        n_total += int(n_tot)
        row_err = row.get("n_err", 0)
        n_err += 0 if pd.isna(row_err) else int(row_err)
    return n_completed, n_total, n_err


def search_for_reasons_to_archive(result_dir: Path, min_study_size: int = 0) -> list[StudyInfo]:
    """Scan every study directory under ``result_dir`` and flag archive candidates.

    For each sub-directory the most recent ``summary*.csv`` is read; when none
    exists the summary is recomputed from the raw results. A study is flagged
    (``should_delete=True``, with the reasons appended to ``reason``) when any
    of the following holds: its name contains "miniwob-tiny-test", the summary
    is empty, it has no tasks, no task completed, fewer than 50% of the tasks
    finished without error, or ``n_total <= min_study_size``.

    Args:
        result_dir: Directory whose immediate sub-directories are studies.
        min_study_size: Studies with at most this many tasks are flagged.

    Returns:
        One ``StudyInfo`` per study directory that could be summarised,
        flagged or not. Directories whose results cannot be loaded are
        reported on stdout and skipped.
    """
    study_info_list = []
    progress = tqdm(list(result_dir.iterdir()), desc="Processing studies")
    for study_dir in progress:
        progress.set_postfix({"study_dir": study_dir})
        if not study_dir.is_dir():
            progress.set_postfix({"status": "skipped"})
            continue

        # Best effort: a study that cannot be loaded is still judged from its summary.
        try:
            study = Study.load(study_dir)
        except Exception:
            study = None

        # Prefer the most recent summary*.csv; otherwise rebuild the summary.
        summary_files = list(study_dir.glob("summary*.csv"))
        if summary_files:
            most_recent_summary = max(summary_files, key=os.path.getctime)
            summary_df = pd.read_csv(most_recent_summary)
        else:
            try:
                result_df = inspect_results.load_result_df(study_dir, progress_fn=None)
                summary_df = inspect_results.summarize_study(result_df)
            except Exception as e:
                print(f" Error processing {study_dir}: {e}")
                continue

        study_info = StudyInfo(
            study_dir=study_dir,
            study=study,
            summary_df=summary_df,
        )

        if len(study_info.summary_df) == 0:
            study_info.should_delete = True
            # Newline-terminated like every other reason so they do not run together.
            study_info.reason = "Empty summary DataFrame\n"

        n_completed, n_total, n_err = _sum_summary_counts(study_info.summary_df)
        n_finished = n_completed - n_err

        if "miniwob-tiny-test" in study_dir.name:
            study_info.should_delete = True
            study_info.reason += "Miniwob tiny test\n"
        if n_total == 0:
            study_info.should_delete = True
            study_info.reason += "No tasks\n"
        if n_completed == 0:
            study_info.should_delete = True
            study_info.reason += "No tasks completed\n"
        # The ratio is undefined without tasks; that case is already reported
        # as "No tasks" above, so skip it instead of dividing by zero.
        if n_total > 0 and float(n_finished) / float(n_total) < 0.5:
            study_info.should_delete = True
            study_info.reason += f"Less than 50% tasks finished, n_err: {n_err}, n_total: {n_total}, n_finished: {n_finished}, n_completed: {n_completed}\n"

        if n_total <= min_study_size:
            study_info.should_delete = True
            study_info.reason += (
                f"Too few tasks. n_total ({n_total}) <= min_study_size ({min_study_size})\n"
            )

        study_info_list.append(study_info)
    return study_info_list
97+
98+
99+
if __name__ == "__main__":
    # Report every study flagged for archiving and, when enabled, move it to
    # a sibling "archived_agentlab_results" directory together with a text
    # file recording the reasons.
    study_list_info = search_for_reasons_to_archive(RESULTS_DIR, min_study_size=5)
    archive_dir = RESULTS_DIR.parent / "archived_agentlab_results"  # type: Path

    # Dry run by default: comment out the line below to actually move the
    # flagged studies into the archive directory.
    archive_dir = None

    if archive_dir is not None:
        # Create the archive directory only when studies will really be moved.
        archive_dir.mkdir(parents=True, exist_ok=True)

    for study_info in study_list_info:
        if not study_info.should_delete:
            continue

        print(f"Study: {study_info.study_dir.name}")
        print(f" Reason: {study_info.reason}")
        print(study_info.summary_df)
        print()

        if archive_dir is not None:
            # Move the study into the archive directory.
            new_path = archive_dir / study_info.study_dir.name
            study_info.study_dir.rename(new_path)
            # Keep the reasons next to the archived study.
            reason_file = new_path / "reason_to_archive.txt"
            reason_file.write_text(study_info.reason)

0 commit comments

Comments
 (0)