
Commit b0d4a99

Merge remote-tracking branch 'origin' into osworld
2 parents: 2afb28b + 7b8d24e

8 files changed: +1064 -569 lines

main_workarena_debug.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+"""
+Note: This script is a convenience script to launch experiments instead of using
+the command line.
+
+Copy this script and modify at will, but don't push your changes to the
+repository.
+"""
+
+import logging
+from copy import deepcopy
+
+import bgym
+
+from agentlab.agents.tool_use_agent.tool_use_agent import (
+    DEFAULT_PROMPT_CONFIG,
+    GPT_4_1,
+    ToolUseAgentArgs,
+)
+from agentlab.experiments.study import Study
+
+logging.getLogger().setLevel(logging.INFO)
+
+config = deepcopy(DEFAULT_PROMPT_CONFIG)
+# config.keep_last_n_obs = 1
+config.obs.use_som = True
+
+
+agent_configs = [
+    ToolUseAgentArgs(
+        model_args=GPT_4_1,
+        config=config,
+    ),
+    # ToolUseAgentArgs(
+    #     model_args=GPT_4_1,
+    #     config=config,
+    # ),
+]
+
+for agent_config in agent_configs:
+    agent_config.config.action_subsets = ("workarena",)  # use the workarena action set
+
+
+# ## select the benchmark to run on
+# benchmark = "miniwob_tiny_test"
+benchmark = "workarena_l1"
+
+
+benchmark = bgym.DEFAULT_BENCHMARKS[benchmark](n_repeats=4)  # type: bgym.Benchmark
+benchmark = benchmark.subset_from_glob("task_name", "*create*")
+
+# for env_args in benchmark.env_args_list:
+#     print(env_args.task_name)
+#     env_args.max_steps = 15
+
+relaunch = False
+
+## Number of parallel jobs
+n_jobs = 10  # Make sure to use 1 job when debugging in VSCode
+parallel_backend = "ray"
+# parallel_backend = "sequential"  # activate sequential backend for debugging in VSCode
+
+if __name__ == "__main__":  # necessary for dask backend
+
+    if relaunch:
+        # relaunch an existing study
+        study = Study.load_most_recent(contains=None)
+        study.find_incomplete(include_errors=True)
+
+    else:
+        study = Study(agent_configs, benchmark, logging_level_stdout=logging.WARNING)
+
+    study.run(
+        n_jobs=n_jobs,
+        parallel_backend=parallel_backend,  # "ray", "joblib" or "sequential"
+        strict_reproducibility=False,
+        n_relaunch=3,
+    )
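
The script reads top to bottom: build the agent configuration(s), restrict them to the workarena action set, narrow the benchmark to tasks matching "*create*", then launch a new Study or relaunch the most recent one. For stepping through in a debugger, the script's own comments suggest a two-line variant; a minimal sketch of that suggestion (not part of the commit):

n_jobs = 1  # a single job, so breakpoints are hit in the main process
parallel_backend = "sequential"  # run each task inline instead of through ray

With those settings, study.run() executes everything in-process, so ordinary VSCode breakpoints work.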

src/agentlab/agents/tool_use_agent/tool_use_agent.py

Lines changed: 93 additions & 49 deletions
@@ -26,11 +26,14 @@
 from agentlab.llm.base_api import BaseModelArgs
 from agentlab.llm.llm_utils import image_to_png_base64_url
 from agentlab.llm.response_api import (
+    APIPayload,
     ClaudeResponseModelArgs,
     LLMOutput,
     MessageBuilder,
     OpenAIChatModelArgs,
     OpenAIResponseModelArgs,
+    OpenRouterModelArgs,
+    ToolCalls,
 )
 from agentlab.llm.tracking import cost_tracker_decorator

@@ -101,7 +104,8 @@ def flatten(self) -> list[MessageBuilder]:
             messages.extend(group.messages)
             # Mark all summarized messages for caching
             if i == len(self.groups) - keep_last_n_obs:
-                messages[i].mark_all_previous_msg_for_caching()
+                if not isinstance(messages[i], ToolCalls):
+                    messages[i].mark_all_previous_msg_for_caching()
         return messages

     def set_last_summary(self, summary: MessageBuilder):
@@ -130,8 +134,10 @@ class Goal(Block):

     goal_as_system_msg: bool = True

-    def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
-        system_message = llm.msg.system().add_text(SYS_MSG)
+    def apply(
+        self, llm, discussion: StructuredDiscussion, obs: dict, sys_msg: str = SYS_MSG
+    ) -> dict:
+        system_message = llm.msg.system().add_text(sys_msg)
         discussion.append(system_message)

         if self.goal_as_system_msg:
@@ -164,18 +170,16 @@ class Obs(Block):
     use_dom: bool = False
     use_som: bool = False
     use_tabs: bool = False
-    add_mouse_pointer: bool = False
+    # add_mouse_pointer: bool = False
     use_zoomed_webpage: bool = False
     skip_preprocessing: bool = False

     def apply(
         self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput
     ) -> dict:
-        if last_llm_output.tool_calls is None:
-            obs_msg = llm.msg.user()  # type: MessageBuilder
-        else:
-            obs_msg = llm.msg.tool(last_llm_output.raw_response)  # type: MessageBuilder

+        obs_msg = llm.msg.user()
+        tool_calls = last_llm_output.tool_calls
         if self.use_last_error:
             if obs["last_action_error"] != "":
                 obs_msg.add_text(f"Last action error:\n{obs['last_action_error']}")
@@ -186,13 +190,12 @@ def apply(
         else:
             screenshot = obs["screenshot"]

-        if self.add_mouse_pointer:
-            # TODO this mouse pointer should be added at the browsergym level
-            screenshot = np.array(
-                agent_utils.add_mouse_pointer_from_action(
-                    Image.fromarray(obs["screenshot"]), obs["last_action"]
-                )
-            )
+        # if self.add_mouse_pointer:
+        #     screenshot = np.array(
+        #         agent_utils.add_mouse_pointer_from_action(
+        #             Image.fromarray(obs["screenshot"]), obs["last_action"]
+        #         )
+        #     )

         obs_msg.add_image(image_to_png_base64_url(screenshot))
         if self.use_axtree:
@@ -203,6 +206,13 @@ def apply(
             obs_msg.add_text(_format_tabs(obs))

         discussion.append(obs_msg)
+
+        if tool_calls:
+            for call in tool_calls:
+                call.response_text("See Observation")
+            tool_response = llm.msg.add_responded_tool_calls(tool_calls)
+            discussion.append(tool_response)
+
         return obs_msg


@@ -253,8 +263,8 @@ def apply(self, llm, discussion: StructuredDiscussion) -> dict:
         msg = llm.msg.user().add_text("""Summarize\n""")

         discussion.append(msg)
-        # TODO need to make sure we don't force tool use here
-        summary_response = llm(messages=discussion.flatten(), tool_choice="none")
+
+        summary_response = llm(APIPayload(messages=discussion.flatten()))

         summary_msg = llm.msg.assistant().add_text(summary_response.think)
         discussion.append(summary_msg)
@@ -319,24 +329,6 @@ def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
         discussion.append(msg)


-class ToolCall(Block):
-    def __init__(self, tool_server):
-        self.tool_server = tool_server
-
-    def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict:
-        # build the message by adding components to obs
-        response: LLMOutput = llm(messages=self.messages)
-
-        messages.append(response.assistant_message)  # this is tool call
-
-        tool_answer = self.tool_server.call_tool(response)
-        tool_msg = llm.msg.tool()  # type: MessageBuilder
-        tool_msg.add_tool_id(response.last_computer_call_id)
-        tool_msg.update_last_raw_response(response)
-        tool_msg.add_text(str(tool_answer))
-        messages.append(tool_msg)
-
-
 @dataclass
 class PromptConfig:
     tag_screenshot: bool = True  # Whether to tag the screenshot with the last action.
@@ -401,7 +393,7 @@ def __init__(

         self.call_ids = []

-        self.llm = model_args.make_model(extra_kwargs={"tools": self.tools})
+        self.llm = model_args.make_model()
         self.msg_builder = model_args.get_message_builder()
         self.llm.msg = self.msg_builder

@@ -451,7 +443,13 @@ def get_action(self, obs: Any) -> float:
         self.llm.reset_stats()
         if not self.discussion.is_goal_set():
             self.discussion.new_group("goal")
-            self.config.goal.apply(self.llm, self.discussion, obs)
+
+            if self.config.multiaction:
+                sys_msg = SYS_MSG + "\nYou can take multiple actions in a single step, if needed."
+            else:
+                sys_msg = SYS_MSG + "\nYou can only take one action at a time."
+            self.config.goal.apply(self.llm, self.discussion, obs, sys_msg)
+
             self.config.summarizer.apply_init(self.llm, self.discussion)
             self.config.general_hints.apply(self.llm, self.discussion)
             self.task_hint.apply(self.llm, self.discussion, self.task_name)
@@ -464,21 +462,23 @@ def get_action(self, obs: Any) -> float:

         messages = self.discussion.flatten()
         response: LLMOutput = self.llm(
-            messages=messages,
-            tool_choice="any",
-            cache_tool_definition=True,
-            cache_complete_prompt=False,
-            use_cache_breakpoints=True,
+            APIPayload(
+                messages=messages,
+                tools=self.tools,  # You can update the available tools now.
+                tool_choice="any",
+                cache_tool_definition=True,
+                cache_complete_prompt=False,
+                use_cache_breakpoints=True,
+            )
         )
-
         action = response.action
         think = response.think
         last_summary = self.discussion.get_last_summary()
         if last_summary is not None:
             think = last_summary.content[0]["text"] + "\n" + think

         self.discussion.new_group()
-        self.discussion.append(response.tool_calls)
+        # self.discussion.append(response.tool_calls)  # No need to append tool calls anymore.

         self.last_response = response
         self._responses.append(response)  # may be useful for debugging
@@ -488,8 +488,11 @@ def get_action(self, obs: Any) -> float:
         tools_msg = MessageBuilder("tool_description").add_text(tools_str)

         # Adding these extra messages to visualize in gradio
-        messages.insert(0, tools_msg)  # insert at the beginning of the messages
-        messages.append(response.tool_calls)
+        messages.insert(0, tools_msg)  # insert at the beginning of the message
+        # This avoids the assertion error with self.llm.user().add_responded_tool_calls(tool_calls)
+        msg = self.llm.msg("tool")
+        msg.responded_tool_calls = response.tool_calls
+        messages.append(msg)

         agent_info = bgym.AgentInfo(
             think=think,
@@ -499,7 +502,7 @@ def get_action(self, obs: Any) -> float:
         return action, agent_info


-OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs(
+GPT_4_1 = OpenAIResponseModelArgs(
     model_name="gpt-4.1",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
@@ -535,6 +538,32 @@ def get_action(self, obs: Any) -> float:
     vision_support=True,
 )

+O3_RESPONSE_MODEL = OpenAIResponseModelArgs(
+    model_name="o3-2025-04-16",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,  # O3 does not support temperature
+    vision_support=True,
+)
+O3_CHATAPI_MODEL = OpenAIChatModelArgs(
+    model_name="o3-2025-04-16",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,
+    vision_support=True,
+)
+
+GPT4_1_OPENROUTER_MODEL = OpenRouterModelArgs(
+    model_name="openai/gpt-4.1",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=None,
+    vision_support=True,
+)
+
 DEFAULT_PROMPT_CONFIG = PromptConfig(
     tag_screenshot=True,
     goal=Goal(goal_as_system_msg=True),
@@ -549,8 +578,8 @@ def get_action(self, obs: Any) -> float:
     summarizer=Summarizer(do_summary=True),
     general_hints=GeneralHints(use_hints=False),
     task_hint=TaskHint(use_task_hint=True),
-    keep_last_n_obs=None,  # keep only the last observation in the discussion
-    multiaction=False,  # whether to use multi-action or not
+    keep_last_n_obs=None,
+    multiaction=True,  # whether to use multi-action or not
     # action_subsets=("bid",),
     action_subsets=("coord"),
     # action_subsets=("coord", "bid"),
@@ -561,6 +590,21 @@ def get_action(self, obs: Any) -> float:
     config=DEFAULT_PROMPT_CONFIG,
 )

+OAI_AGENT = ToolUseAgentArgs(
+    model_args=GPT_4_1,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
+OAI_CHATAPI_AGENT = ToolUseAgentArgs(
+    model_args=O3_CHATAPI_MODEL,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
+OAI_OPENROUTER_AGENT = ToolUseAgentArgs(
+    model_args=GPT4_1_OPENROUTER_MODEL,
+    config=DEFAULT_PROMPT_CONFIG,
+)
+
 OSWORLD_CLAUDE = ToolUseAgentArgs(
     model_args=CLAUDE_MODEL_CONFIG,
     config=PromptConfig(
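
The central interface change in this file: per-call options that used to be keyword arguments to the model, and tools that used to be bound once via make_model(extra_kwargs={"tools": ...}), now travel together in a single APIPayload, so the tool list can change from one call to the next. A minimal sketch of the new calling convention, reusing GPT_4_1, APIPayload, make_model, and get_message_builder from the diff above; the prompt text and the empty tool list are placeholder assumptions:

from agentlab.agents.tool_use_agent.tool_use_agent import GPT_4_1
from agentlab.llm.response_api import APIPayload

llm = GPT_4_1.make_model()  # tools are no longer fixed at construction time
llm.msg = GPT_4_1.get_message_builder()

messages = [llm.msg.system().add_text("You are a web agent.")]  # placeholder prompt
tools = []  # hypothetical: the agent normally derives these from its action subsets

response = llm(
    APIPayload(
        messages=messages,
        tools=tools,  # may differ between calls
        tool_choice="any",
    )
)
print(response.action, response.think)

The Summarizer shows the payoff: instead of forcing tool_choice="none" as before, it now simply omits tools from the payload with llm(APIPayload(messages=discussion.flatten())).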

src/agentlab/analyze/agent_xray.py

Lines changed: 5 additions & 1 deletion
@@ -26,6 +26,7 @@
 from agentlab.llm.llm_utils import BaseMessage as AgentLabBaseMessage
 from agentlab.llm.llm_utils import Discussion
 from agentlab.llm.response_api import MessageBuilder
+from agentlab.llm.response_api import ToolCalls

 select_dir_instructions = "Select Experiment Directory"
 AGENT_NAME_KEY = "agent.agent_name"
@@ -673,6 +674,9 @@ def dict_to_markdown(d: dict):
         str: A markdown-formatted string representation of the dictionary.
     """
     if not isinstance(d, dict):
+        if isinstance(d, ToolCalls):
+            # ToolCalls rendered by to_markdown method.
+            return ""
         warning(f"Expected dict, got {type(d)}")
         return repr(d)
     if not d:
@@ -1164,7 +1168,7 @@ def get_directory_contents(results_dir: Path):
         most_recent_summary = max(summary_files, key=os.path.getctime)
         summary_df = pd.read_csv(most_recent_summary)

-        if len(summary_df) == 0 or summary_df["avg_reward"].isna().all():
+        if len(summary_df) == 0:
             continue  # skip if all avg_reward are NaN

         # get row with max avg_reward
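
The last hunk also widens what get_directory_contents keeps: a summary whose avg_reward column is entirely NaN used to be skipped, and now only truly empty CSVs are. A small pandas illustration of the difference (hypothetical data, not from the repository):

import pandas as pd

summary_df = pd.DataFrame({"avg_reward": [float("nan"), float("nan")]})

old_skip = len(summary_df) == 0 or summary_df["avg_reward"].isna().all()  # True: skipped
new_skip = len(summary_df) == 0  # False: kept
print(old_skip, new_skip)

So studies that ran without ever producing a reward now still show up in the experiment listing.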