Skip to content

Commit d92e0bf

Browse files
Merge pull request #262 from ServiceNow/deep_debug
Deep debug
2 parents a67b22b + 6a4c808 commit d92e0bf

File tree

5 files changed

+148
-16
lines changed

5 files changed

+148
-16
lines changed

main_workarena_debug.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
Note: This script is a convenience script to launch experiments instead of using
the command line.

Copy this script and modify at will, but don't push your changes to the
repository.
"""

import logging
from copy import deepcopy

import bgym

from agentlab.agents.tool_use_agent.tool_use_agent import (
    DEFAULT_PROMPT_CONFIG,
    GPT_4_1,
    ToolUseAgentArgs,
)
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

# Start from the default prompt config and tweak only what this debug run needs.
config = deepcopy(DEFAULT_PROMPT_CONFIG)
# config.keep_last_n_obs = 1
config.obs.use_som = True  # enable set-of-marks overlays in the observation


agent_configs = [
    ToolUseAgentArgs(
        model_args=GPT_4_1,
        config=config,
    ),
    # ToolUseAgentArgs(
    #     model_args=GPT_4_1,
    #     config=config,
    # ),
]

for agent_config in agent_configs:
    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set


# ## select the benchmark to run on
# benchmark = "miniwob_tiny_test"
benchmark = "workarena_l1"


benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4)  # type: bgym.Benchmark
# Restrict the run to "create" tasks only; drop this line to run the full L1 set.
benchmark = benchmark.subset_from_glob("task_name", "*create*")

# for env_args in benchmark.env_args_list:
#     print(env_args.task_name)
#     env_args.max_steps = 15

# Set to True to resume the most recent study instead of starting a new one.
relaunch = False

## Number of parallel jobs
n_jobs = 10  # Make sure to use 1 job when debugging in VSCode
parallel_backend = "ray"
# parallel_backend = "sequential"  # activate sequential backend for debugging in VSCode

if __name__ == "__main__":  # necessary for dask backend

    if relaunch:
        # relaunch an existing study
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)

    else:
        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)

    study.run(
        n_jobs=n_jobs,
        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
        strict_reproducibility=False,
        n_relaunch=3,
    )

src/agentlab/agents/tool_use_agent/tool_use_agent.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,10 @@ class Goal(Block):
127127

128128
goal_as_system_msg: bool = True
129129

130-
def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
131-
system_message = llm.msg.system().add_text(SYS_MSG)
130+
def apply(
131+
self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG
132+
) -> dict:
133+
system_message = llm.msg.system().add_text(sys_msg)
132134
discussion.append(system_message)
133135

134136
if self.goal_as_system_msg:
@@ -441,7 +443,13 @@ def get_action(self, obs: Any) -> float:
441443
self.llm.reset_stats()
442444
if not self.discussion.is_goal_set():
443445
self.discussion.new_group("goal")
444-
self.config.goal.apply(self.llm, self.discussion, obs)
446+
447+
if self.config.multiaction:
448+
sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed."
449+
else:
450+
sys_msg = SYS_MSG + "\nYou can only take one action at a time."
451+
self.config.goal.apply(self.llm, self.discussion, obs, sys_msg)
452+
445453
self.config.summarizer.apply_init(self.llm, self.discussion)
446454
self.config.general_hints.apply(self.llm, self.discussion)
447455
self.task_hint.apply(self.llm, self.discussion, self.task_name)
@@ -489,7 +497,7 @@ def get_action(self, obs: Any) -> float:
489497
return action, agent_info
490498

491499

492-
OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs(
500+
GPT_4_1 = OpenAIResponseModelArgs(
493501
model_name="gpt-4.1",
494502
max_total_tokens=200_000,
495503
max_input_tokens=200_000,

src/agentlab/analyze/agent_xray.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1164,7 +1164,7 @@ def get_directory_contents(results_dir: Path):
11641164
most_recent_summary = max(summary_files, key=os.path.getctime)
11651165
summary_df = pd.read_csv(most_recent_summary)
11661166

1167-
if len(summary_df) == 0 or summary_df["avg_reward"].isna().all():
1167+
if len(summary_df) == 0:
11681168
continue # skip empty summaries — NOTE(review): comment is stale, the all-NaN avg_reward check was removed in this commit
11691169

11701170
# get row with max avg_reward

src/agentlab/analyze/overlay_utils.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import ast
22
import inspect
3+
import math
34
from dataclasses import dataclass
45
from typing import Any, Union
56

67
import matplotlib.pyplot as plt
8+
import PIL
79
from browsergym.core.action.highlevel import ACTION_SUBSETS
810
from PIL import Image, ImageDraw
911

@@ -289,17 +291,54 @@ def overlay_rectangle(
289291
bbox: tuple[float, float, float, float],
290292
color: Union[str, tuple[int, int, int]] = "red",
291293
width: int = 1,
294+
dashed: bool = True,
292295
) -> Image.Image:
293296
draw = ImageDraw.Draw(img)
294297

295298
x, y, w, h = bbox
296299

297-
# Draw rectangle outline
298-
draw.rectangle([x, y, x + w, y + h], outline=color, width=width)
300+
if dashed:
301+
# Draw dashed rectangle
302+
print("Drawing dashed rectangle")
303+
linedashed(draw, x, y, x + w, y, color, width)
304+
linedashed(draw, x + w, y, x + w, y + h, color, width)
305+
linedashed(draw, x + w, y + h, x, y + h, color, width)
306+
linedashed(draw, x, y + h, x, y, color, width)
307+
else:
308+
draw.rectangle([x, y, x + w, y + h], outline=color, width=width)
299309

300310
return img
301311

302312

313+
# Adapted from https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306
def linedashed(
    draw: "PIL.ImageDraw.Draw", x0, y0, x1, y1, fill, width, dash_length=4, nodash_length=8
):
    """Draw a dashed straight line from (x0, y0) to (x1, y1).

    Args:
        draw: a PIL ``ImageDraw`` object (anything exposing a ``line`` method).
        x0, y0, x1, y1: endpoints of the line, in pixels.
        fill: color forwarded to ``draw.line``.
        width: stroke width in pixels, forwarded to ``draw.line``.
        dash_length: length of each drawn dash segment, in pixels.
        nodash_length: gap between consecutive dashes, in pixels.
    """
    line_dx = x1 - x0  # delta x (can be negative)
    line_dy = y1 - y0  # delta y (can be negative)
    line_length = math.hypot(line_dx, line_dy)  # line length (always >= 0)
    if line_length == 0:
        return  # degenerate line: nothing to draw; avoids division by zero below
    # Unit direction vector along the line.
    pixel_dx = line_dx / line_length
    pixel_dy = line_dy / line_length
    dash_start = 0
    while dash_start < line_length:
        # Clip the final dash so it never overshoots the endpoint.
        dash_end = min(dash_start + dash_length, line_length)
        draw.line(
            (
                round(x0 + pixel_dx * dash_start),
                round(y0 + pixel_dy * dash_start),
                round(x0 + pixel_dx * dash_end),
                round(y0 + pixel_dy * dash_end),
            ),
            fill=fill,
            width=width,
        )
        dash_start += dash_length + nodash_length
303342
def annotate_action(
304343
img: Image.Image, action_string: str, properties: dict[str, tuple], colormap: str = "tab10"
305344
) -> str:

src/agentlab/llm/response_api.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,6 @@ def __init__(
313313
**kwargs,
314314
):
315315
self.tools = kwargs.pop("tools", None)
316-
self.tool_choice = kwargs.pop("tool_choice", None)
317316
super().__init__(
318317
model_name=model_name,
319318
api_key=api_key,
@@ -324,7 +323,9 @@ def __init__(
324323
)
325324
self.client = OpenAI(api_key=api_key)
326325

327-
def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict:
326+
def _call_api(
327+
self, messages: list[Any | MessageBuilder], tool_choice: str = "auto", **kwargs
328+
) -> dict:
328329
input = []
329330
for msg in messages:
330331
input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg])
@@ -339,8 +340,10 @@ def _call_api(self, messages: list[Any | MessageBuilder], **kwargs) -> dict:
339340

340341
if self.tools is not None:
341342
api_params["tools"] = self.tools
342-
if self.tool_choice is not None:
343-
api_params["tool_choice"] = self.tool_choice
343+
if tool_choice in ("any", "required"):
344+
tool_choice = "required"
345+
346+
api_params["tool_choice"] = tool_choice
344347

345348
# api_params |= kwargs # Merge any additional parameters passed
346349
response = call_openai_api_with_retries(
@@ -388,7 +391,6 @@ def __init__(
388391
):
389392

390393
self.tools = self.format_tools_for_chat_completion(kwargs.pop("tools", None))
391-
self.tool_choice = kwargs.pop("tool_choice", None)
392394

393395
super().__init__(
394396
model_name=model_name,
@@ -403,7 +405,9 @@ def __init__(
403405
**client_args
404406
) # Ensures client_args is a dict or defaults to an empty dict
405407

406-
def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.ChatCompletion:
408+
def _call_api(
409+
self, messages: list[dict | MessageBuilder], tool_choice: str = "auto"
410+
) -> openai.types.chat.ChatCompletion:
407411
input = []
408412
for msg in messages:
409413
input.extend(msg.prepare_message() if isinstance(msg, MessageBuilder) else [msg])
@@ -416,8 +420,10 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> openai.types.chat.
416420
}
417421
if self.tools is not None:
418422
api_params["tools"] = self.tools
419-
if self.tool_choice is not None:
420-
api_params["tool_choice"] = self.tool_choice
423+
424+
if tool_choice in ("any", "required"):
425+
tool_choice = "required"
426+
api_params["tool_choice"] = tool_choice
421427

422428
response = call_openai_api_with_retries(self.client.chat.completions.create, api_params)
423429

@@ -517,7 +523,6 @@ def __init__(
517523
**kwargs,
518524
):
519525
self.tools = kwargs.pop("tools", None)
520-
self.tool_choice = kwargs.pop("tool_choice", None)
521526

522527
super().__init__(
523528
model_name=model_name,
@@ -543,6 +548,9 @@ def _call_api(
543548
temp = self.apply_cache_breakpoints(msg, temp)
544549
input.extend(temp)
545550

551+
if tool_choice in ("any", "required"):
552+
tool_choice = "any" # Claude API expects "any" and gpt expects "required"
553+
546554
api_params: Dict[str, Any] = {
547555
"model": self.model_name,
548556
"messages": input,

0 commit comments

Comments
 (0)