ServiceNow
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 2 deletions b/‎.gitignore‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎src/agentlab/agents/generic_agent/__init__.py‎
Lines changed: 9 additions & 6 deletions b/‎src/agentlab/agents/generic_agent/__init__.py‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎src/agentlab/agents/tool_use_agent/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/agentlab/agents/tool_use_agent/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/agentlab/agents/tool_use_agent/tool_use_agent.py‎
Lines changed: 11 additions & 2 deletions b/‎src/agentlab/agents/tool_use_agent/tool_use_agent.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎src/agentlab/analyze/agent_xray.py‎
Lines changed: 75 additions & 19 deletions b/‎src/agentlab/analyze/agent_xray.py‎
Lines changed: 75 additions & 19 deletions
diff --git a/‎src/agentlab/analyze/archive_studies.py‎
Lines changed: 122 additions & 0 deletions b/‎src/agentlab/analyze/archive_studies.py‎
Lines changed: 122 additions & 0 deletions
@@ -177,5 +177,4 @@ OSWorld/
 
 
 # working files
-main_miniwob_debug.py
-main_workarena_debug.py
+experiments/*
@@ -9,20 +9,23 @@
 from .agent_configs import (
     AGENT_3_5,
     AGENT_8B,
+    AGENT_37_SONNET,
+    AGENT_CLAUDE_SONNET_35,
+    AGENT_CLAUDE_SONNET_35_VISION,
     AGENT_CUSTOM,
-    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA3_70B,
+    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA31_70B,
+    CHAT_MODEL_ARGS_DICT,
     RANDOM_SEARCH_AGENT,
     AGENT_4o,
     AGENT_4o_MINI,
-    AGENT_CLAUDE_SONNET_35,
-    AGENT_37_SONNET,
-    AGENT_CLAUDE_SONNET_35_VISION,
-    AGENT_4o_VISION,
     AGENT_4o_MINI_VISION,
-    AGENT_o3_MINI,
+    AGENT_4o_VISION,
     AGENT_o1_MINI,
+    AGENT_o3_MINI,
+    FLAGS_GPT_4o,
+    GenericAgentArgs,
 )
 
 __all__ = [
 
@@ -1,4 +1,6 @@
 import sys
 
+from agentlab.agents.tool_use_agent.tool_use_agent import *
+
 # for backward compatibility of unpickling
 sys.modules[__name__ + ".multi_tool_agent"] = sys.modules[__name__]
@@ -150,7 +150,7 @@ def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
 
 AXTREE_NOTE = """
 AXTree extracts most of the interactive elements of the DOM in a tree structure. It may also contain information that is not visible in the screenshot.
-A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools.
+A line starting with [bid] is a node in the AXTree. It is a unique alpha-numeric identifier to be used when calling tools, e.g, click(bid="a253"). Make sure to include letters and numbers in the bid.
 """
 
 
@@ -347,7 +347,7 @@ class PromptConfig:
     task_hint: TaskHint = None
     keep_last_n_obs: int = 1
     multiaction: bool = False
-    action_subsets: tuple[str] = field(default_factory=lambda: ("coord",))
+    action_subsets: tuple[str] = None
 
 
 @dataclass
@@ -512,6 +512,15 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )
 
+GPT_4_1_MINI = OpenAIResponseModelArgs(
+    model_name="gpt-4.1-mini",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=0.1,
+    vision_support=True,
+)
+
 OPENAI_CHATAPI_MODEL_CONFIG = OpenAIChatModelArgs(
     model_name="gpt-4o-2024-08-06",
     max_total_tokens=200_000,
 
@@ -83,7 +83,7 @@ class StepId:
 @dataclass
 class Info:
     results_dir: Path = None  # to root directory of all experiments
-    exp_list_dir: Path = None  # the path of the currently selected experiment
+    study_dirs: list[Path] = None  # directories of the currently selected studies (set by new_exp_dir)
     result_df: pd.DataFrame = None  # the raw loaded df
     agent_df: pd.DataFrame = None  # the df filtered for selected agent
     tasks_df: pd.DataFrame = None  # the unique tasks for selected agent
@@ -178,6 +178,8 @@ def run_gradio(results_dir: Path):
         agent_task_id = gr.State(value=None)
         step_id = gr.State(value=None)
 
+        hidden_key_input = gr.Textbox(visible=False, elem_id="key_capture")
+
         with gr.Accordion("Help", open=False):
             gr.Markdown(
                 """\
@@ -207,6 +209,7 @@ def run_gradio(results_dir: Path):
             exp_dir_choice = gr.Dropdown(
                 choices=get_directory_contents(results_dir),
                 value=select_dir_instructions,
+                multiselect=True,
                 label="Experiment Directory",
                 show_label=False,
                 scale=6,
@@ -502,6 +505,32 @@ def run_gradio(results_dir: Path):
 
         demo.load(fn=refresh_exp_dir_choices, inputs=exp_dir_choice, outputs=exp_dir_choice)
 
+        demo.load(
+            None,
+            None,
+            None,
+            js="""
+    function() {
+        document.addEventListener('keydown', function(e) {
+            if ((e.key === 'ArrowLeft' || e.key === 'ArrowRight') && (e.metaKey || e.ctrlKey)) {
+                e.preventDefault();
+                const hiddenInput = document.querySelector('#key_capture input, #key_capture textarea');
+                if (hiddenInput) {
+                    let event = e.key === 'ArrowLeft' ? 'Cmd+Left' : 'Cmd+Right';
+                    hiddenInput.value = event;
+                    hiddenInput.dispatchEvent(new Event('input', {bubbles: true}));
+                }
+            }
+        });
+    }
+        """,
+        )
+        hidden_key_input.change(
+            handle_key_event,
+            inputs=[hidden_key_input, step_id],
+            outputs=[hidden_key_input, step_id],
+        )
+
     demo.queue()
 
     do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", "false").lower() == "true"
@@ -511,6 +540,25 @@ def run_gradio(results_dir: Path):
     demo.launch(server_port=port, share=do_share)
 
 
+def handle_key_event(key_event, step_id: StepId):
+    """Move the selected step backward or forward in response to a keyboard shortcut.
+
+    Args:
+        key_event: Text written into the hidden key-capture textbox. Values starting
+            with "Cmd+Left" / "Cmd+Right" move one step back / forward; anything else
+            (including the empty string written back by this handler) is ignored.
+        step_id: The currently selected step, or None when nothing is selected yet.
+
+    Returns:
+        A ``(textbox_value, step_id)`` tuple. Two values are always returned because
+        the handler is wired to two outputs. The textbox value is always reset to ""
+        so that pressing the same shortcut again changes the textbox once more.
+    """
+    global info
+
+    # Without a selected step there is nothing to move; only reset the textbox.
+    if key_event and step_id is not None:
+        if key_event.startswith("Cmd+Left"):
+            step = max(0, step_id.step - 1)
+        elif key_event.startswith("Cmd+Right"):
+            # NOTE(review): the upper bound of len - 2 is kept from the original code;
+            # presumably the final steps_info entry is not navigable -- confirm.
+            # The outer max() keeps the index non-negative for one-step episodes.
+            step = max(0, min(len(info.exp_result.steps_info) - 2, step_id.step + 1))
+        else:
+            step = None  # unrecognised shortcut: keep the current step
+
+        if step is not None:
+            info.step = step
+            step_id = StepId(episode_id=step_id.episode_id, step=step)
+    return ("", step_id)
+
+
 def tab_select(evt: gr.SelectData):
     global info
     info.active_tab = evt.value
@@ -546,18 +594,24 @@ def get_screenshot(
 ):
     if step is None:
         step = info.step
-    step_info = info.exp_result.steps_info[step]
     try:
+        step_info = info.exp_result.steps_info[step]
         is_som = som_or_not == "SOM Screenshots"
         img = info.exp_result.get_screenshot(step, som=is_som)
         if annotate:
             action_str = step_info.action
             properties = step_info.obs.get("extra_element_properties", None)
-            action_colored = annotate_action(img, action_string=action_str, properties=properties)
+            try:
+                action_colored = annotate_action(
+                    img, action_string=action_str, properties=properties
+                )
+            except Exception as e:
+                warning(f"Failed to annotate action: {e}")
+                action_colored = action_str
         else:
             action_colored = None
         return img, action_colored
-    except FileNotFoundError:
+    except (FileNotFoundError, IndexError):
         return None, None
 
 
@@ -839,6 +893,10 @@ def get_episode_info(info: Info):
     try:
         env_args = info.exp_result.exp_args.env_args
         steps_info = info.exp_result.steps_info
+        if info.step >= len(steps_info):
+            info.step = len(steps_info) - 1
+        if len(steps_info) == 0:
+            return "No steps were taken in this episode."
         step_info = steps_info[info.step]
         try:
             goal = step_info.obs["goal_object"]
@@ -1040,31 +1098,29 @@ def update_global_stats():
 
 
 def update_error_report():
-    report_files = list(info.exp_list_dir.glob("error_report*.md"))
-    if len(report_files) == 0:
-        return "No error report found"
-    report_files = sorted(report_files, key=os.path.getctime, reverse=True)
-    return report_files[0].read_text()
+    return inspect_results.error_report(info.result_df, max_stack_trace=3, use_log=True)
 
 
-def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
-    if exp_dir == select_dir_instructions:
-        return None, None
+def new_exp_dir(study_names: list, progress=gr.Progress(), just_refresh=False):
+    global info
 
-    exp_dir = exp_dir.split(" - ")[0]
+    # remove select_dir_instructions from study_names
+    if select_dir_instructions in study_names:
+        study_names.remove(select_dir_instructions)
 
-    if len(exp_dir) == 0:
-        info.exp_list_dir = None
+    if len(study_names) == 0:
         return None, None
 
-    info.exp_list_dir = info.results_dir / exp_dir
-    info.result_df = inspect_results.load_result_df(info.exp_list_dir, progress_fn=progress.tqdm)
+    info.study_dirs = [info.results_dir / study_name.split(" - ")[0] for study_name in study_names]
+    info.result_df = inspect_results.load_result_df(info.study_dirs, progress_fn=progress.tqdm)
     info.result_df = remove_args_from_col(info.result_df)
 
     study_summary = inspect_results.summarize_study(info.result_df)
     # save study_summary
-    study_summary.to_csv(info.exp_list_dir / "summary_df.csv", index=False)
-    agent_report = display_table(study_summary)
+
+    for study_dir in info.study_dirs:
+        study_summary.to_csv(study_dir / "summary_df.csv", index=False)
+        agent_report = display_table(study_summary)
 
     info.agent_id_keys = agent_report.index.names
     agent_report.reset_index(inplace=True)
 
@@ -0,0 +1,122 @@
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pandas as pd
+from tqdm import tqdm
+
+from agentlab.analyze import inspect_results
+from agentlab.experiments.exp_utils import RESULTS_DIR
+from agentlab.experiments.study import Study
+
+
+@dataclass
+class StudyInfo:
+    """Per-study record produced by search_for_reasons_to_archive."""
+
+    # Directory of the study on disk.
+    study_dir: Path
+    # Result of Study.load(study_dir); None when loading raised an exception.
+    study: Study
+    # Summary table: read from the most recent summary*.csv in study_dir, or
+    # recomputed with inspect_results.summarize_study when no such file exists.
+    summary_df: pd.DataFrame
+    # True once at least one reason to archive the study was found.
+    should_delete: bool = False
+    # Human-readable reasons accumulated while inspecting the study.
+    reason: str = ""
+
+
+def search_for_reasons_to_archive(result_dir: Path, min_study_size: int = 0) -> list[StudyInfo]:
+    """Scan the study directories under ``result_dir`` and flag the ones worth archiving.
+
+    For each sub-directory the most recently created ``summary*.csv`` is read; when
+    none exists, the summary is recomputed through ``inspect_results``. Directories
+    whose summary cannot be computed are reported on stdout and skipped.
+
+    Args:
+        result_dir: Directory whose immediate sub-directories are treated as studies.
+        min_study_size: Studies with ``n_total <= min_study_size`` are flagged.
+
+    Returns:
+        One ``StudyInfo`` per processed study directory. ``should_delete`` is True and
+        ``reason`` holds one line per reason for every flagged study.
+    """
+    study_info_list = []
+    study_dirs = list(result_dir.iterdir())
+    progress = tqdm(study_dirs, desc="Processing studies")
+    for study_dir in progress:
+
+        progress.set_postfix({"study_dir": study_dir})
+        if not study_dir.is_dir():
+            progress.set_postfix({"status": "skipped"})
+            continue
+
+        # Loading the study object is best effort; the summary alone is enough below.
+        try:
+            study = Study.load(study_dir)
+        except Exception:
+            study = None
+
+        # get summary*.csv files and find the most recent
+        summary_files = list(study_dir.glob("summary*.csv"))
+
+        if summary_files:
+            most_recent_summary = max(summary_files, key=os.path.getctime)
+            summary_df = pd.read_csv(most_recent_summary)
+        else:
+            try:
+                result_df = inspect_results.load_result_df(study_dir, progress_fn=None)
+                summary_df = inspect_results.summarize_study(result_df)
+            except Exception as e:
+                print(f"  Error processing {study_dir}: {e}")
+                continue
+
+        study_info = StudyInfo(
+            study_dir=study_dir,
+            study=study,
+            summary_df=summary_df,
+        )
+
+        if len(study_info.summary_df) == 0:
+            study_info.should_delete = True
+            # Newline-terminated like every other reason so the lines do not run together.
+            study_info.reason += "Empty summary DataFrame\n"
+
+        n_completed, n_total, n_err = 0, 0, 0
+
+        for _, row in study_info.summary_df.iterrows():
+            # "n_completed" is formatted as "<completed>/<total>".
+            n_comp, n_tot = row["n_completed"].split("/")
+            n_completed += int(n_comp)
+            n_total += int(n_tot)
+            # A missing or NaN error count is treated as zero errors.
+            row_err = row.get("n_err", 0)
+            n_err += 0 if pd.isna(row_err) else int(row_err)
+
+        n_finished = n_completed - n_err
+
+        if "miniwob-tiny-test" in study_dir.name:
+            study_info.should_delete = True
+            study_info.reason += "Miniwob tiny test\n"
+        if n_total == 0:
+            study_info.should_delete = True
+            study_info.reason += "No tasks\n"
+        if n_completed == 0:
+            study_info.should_delete = True
+            study_info.reason += "No tasks completed\n"
+        # Guard the ratio: a study without tasks is already flagged above and would
+        # otherwise raise ZeroDivisionError here.
+        if n_total > 0 and n_finished / n_total < 0.5:
+            study_info.should_delete = True
+            study_info.reason += f"Less than 50% tasks finished, n_err: {n_err}, n_total: {n_total}, n_finished: {n_finished}, n_completed: {n_completed}\n"
+
+        if n_total <= min_study_size:
+            study_info.should_delete = True
+            study_info.reason += (
+                f"Too few tasks. n_total ({n_total}) <= min_study_size ({min_study_size})\n"
+            )
+
+        study_info_list.append(study_info)
+    return study_info_list
+
+
+if __name__ == "__main__":
+    study_list_info = search_for_reasons_to_archive(RESULTS_DIR, min_study_size=5)
+
+    # Dry run by default: flagged studies are only printed. Set to False to move
+    # them into the archive directory next to RESULTS_DIR.
+    dry_run = True
+
+    archive_dir = None
+    if not dry_run:
+        archive_dir = RESULTS_DIR.parent / "archived_agentlab_results"
+        # Only create the archive directory when studies will actually be moved.
+        archive_dir.mkdir(parents=True, exist_ok=True)
+
+    for study_info in study_list_info:
+        if not study_info.should_delete:
+            continue
+
+        print(f"Study: {study_info.study_dir.name}")
+        print(f"  Reason: {study_info.reason}")
+        print(study_info.summary_df)
+        print()
+
+        if archive_dir is not None:
+            # move to new dir
+            new_path = archive_dir / study_info.study_dir.name
+            if new_path.exists():
+                # Do not clobber (or fail on) a previously archived study of the same name.
+                print(f"  Skipping move, {new_path} already exists")
+                continue
+            study_info.study_dir.rename(new_path)
+            # save reason in a file
+            reason_file = new_path / "reason_to_archive.txt"
+            reason_file.write_text(study_info.reason)