Enable re-prompting the tool-use agent from the agent controller

patricebechard · patricebechard · commit fcffc2e0d28a · 2025-07-21T16:14:01.000-04:00
diff --git a/src/agentlab/agents/tool_use_agent/hint_db.csv b/src/agentlab/agents/tool_use_agent/hint_db.csv
@@ -16,4 +16,26 @@ June 11,miniwob.drag-items,30,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Shape and letters size comparison in miniwob,"Shapes or items have different colors and different size. Size is relative to the other objects in the white area and is either ""large"" or ""small"". Shapes that are larger than the average shape or letter are considered ""large"". Others are ""small""."
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,communicate answer in miniwob,Answer by clicking one of the buttons describing multiple choices.
 June 18,miniwob.count-shape,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Simbols of colors in miniwob,"Colors a distinct in this task, e.g., cyan is not a type of blue. "
-June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
+June 18,miniwob.form-sequence-2,23,claude-3-7-sonnet-20250219,MultiToolUse-claude-3-7-sonnet-20250219,miniwob,miniwob,allac,Reporting results in miniwob,Make sure to click submit to finish the task.
+July 16,workarena.servicenow.sort-asset-list,406,gpt-4-1,ToolUseAgent-gpt-4-1,workarena,workarena,patricebechard,Sorting lists in ServiceNow,"1. **Navigate to Your Table/List**
+
+   * For example, go to **Incident > All** or any other table you want to view.
+
+2. **Sort by One or Multiple Columns**
+
+   * `click` on the ""show / hide filter"" button (funnel icon) at the top left of the page to open the filter row.
+   * Repeat the following steps for each column you want to sort by:
+     * `click` on the ""Add Sort"" button to add a new sort filter. This will create a new ordering filter row with two comboboxes under the heading ""Order results by the following fields"".
+     * `fill` the first combobox with the appropriate field name you want to sort by. MAKE SURE to use the exact field name provided.
+     * `press` Enter after typing the field name. It is VERY IMPORTANT that you do this before doing anything else. DO NOT click on the run filter button before having confirmed your choice by explicitly pressing ENTER.
+     * `select_option` the appropriate ordering, either ascending (a to z) or descending (z to a), in the second combobox.
+   * Once all sort filters have been added, `click` the ""Run filter"" button to apply the sort.
+
+Notes:
+   * NEVER directly sort the columns using the table header.
+   * NEVER add columns via the Personalize List menu.
+
+3. **Resetting or Clearing Sorting**
+
+   * To reset sorting, click another column, or click again to toggle.
+   * In the filter bar, you may see a ""Sorted by..."" indicator—clear or change it as needed."
diff --git a/src/agentlab/agents/tool_use_agent/tool_use_agent.py b/src/agentlab/agents/tool_use_agent/tool_use_agent.py
@@ -8,22 +8,14 @@
 
 import bgym
 import pandas as pd
-from bgym import Benchmark as BgymBenchmark
-from browsergym.core.observation import extract_screenshot
-from browsergym.utils.obs import (
-    flatten_axtree_to_str,
-    flatten_dom_to_str,
-    overlay_som,
-    prune_html,
-)
-
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.benchmarks.abstract_env import AbstractBenchmark as AgentLabBenchmark
 from agentlab.benchmarks.osworld import OSWorldActionSet
 from agentlab.llm.base_api import BaseModelArgs
 from agentlab.llm.llm_utils import image_to_png_base64_url
 from agentlab.llm.response_api import (
     APIPayload,
+    AzureOpenAIResponseModelArgs,
     ClaudeResponseModelArgs,
     LLMOutput,
     MessageBuilder,
@@ -33,6 +25,14 @@
     ToolCalls,
 )
 from agentlab.llm.tracking import cost_tracker_decorator
+from bgym import Benchmark as BgymBenchmark
+from browsergym.core.observation import extract_screenshot
+from browsergym.utils.obs import (
+    flatten_axtree_to_str,
+    flatten_dom_to_str,
+    overlay_som,
+    prune_html,
+)
 
 
 @dataclass
@@ -43,8 +43,8 @@ def _init(self):
 
     def make(self) -> "Block":
         """Returns a copy so the init can start adding some stuff to `self` without changing the
-        original datatclass that should only contain a config.
-        The aim is avoid having 2 calss definition for each block, e.g. Block and BlockArgs.
+        original dataclass that should only contain a config.
+        The aim is to avoid having 2 class definitions for each block, e.g. Block and BlockArgs.
 
         Returns:
             Block: A copy of the current block instance with initialization applied.
@@ -387,7 +387,6 @@ def __init__(
             self.config.action_subsets, multiaction=self.config.multiaction  # type: ignore
         )
         self.tools = self.action_set.to_tool_description(api=model_args.api)
-
         self.call_ids = []
 
         self.llm = model_args.make_model()
@@ -595,8 +594,8 @@ def get_action(self, obs: Any) -> float:
     task_hint=TaskHint(use_task_hint=True),
     keep_last_n_obs=None,
     multiaction=True,  # whether to use multi-action or not
-    # action_subsets=("bid",),
-    action_subsets=("coord"),
+    action_subsets=("bid",),
+    # action_subsets=("coord"),
     # action_subsets=("coord", "bid"),
 )
 
diff --git a/src/agentlab/analyze/agent_controller.py b/src/agentlab/analyze/agent_controller.py
@@ -25,7 +25,7 @@
 )
 from agentlab.experiments.exp_utils import RESULTS_DIR
 from agentlab.experiments.loop import ExpArgs, StepInfo, save_package_versions
-from agentlab.llm.llm_utils import Discussion
+from agentlab.llm.response_api import LLMOutput
 from bgym import DEFAULT_BENCHMARKS
 from dotenv import load_dotenv
 from transformers import AutoTokenizer
@@ -188,7 +188,12 @@ def step_agent_history(action, action_info):
     st.session_state.action_history.append(action)
     st.session_state.action_info_history.append(action_info)
     st.session_state.thought_history.append(action_info.think)
-    st.session_state.prompt_history.append(get_prompt(action_info))
+    if isinstance(st.session_state.agent, GenericAgent):
+        st.session_state.prompt_history.append(get_prompt(action_info))
+    elif isinstance(st.session_state.agent, ToolUseAgent):
+        st.session_state.prompt_history.append(
+            "\n".join([elem.to_markdown() for elem in st.session_state.agent.discussion.flatten()])
+        )
 
     # HACK: memory history can only be obtained via the agent
     if isinstance(st.session_state.agent, GenericAgent):
@@ -229,10 +234,31 @@ def revert_agent_history():
 
 def revert_agent_state():
     logger.info("Reverting agent state")
-    st.session_state.agent.obs_history.pop()
-    st.session_state.agent.actions.pop()
-    st.session_state.agent.thoughts.pop()
-    st.session_state.agent.memories.pop()
+    if isinstance(st.session_state.agent, GenericAgent):
+        st.session_state.agent.obs_history.pop()
+        st.session_state.agent.actions.pop()
+        st.session_state.agent.thoughts.pop()
+        st.session_state.agent.memories.pop()
+    elif isinstance(st.session_state.agent, ToolUseAgent):
+        num_groups = len(st.session_state.agent.discussion.groups)
+        if num_groups == 3:
+            # start from blank state
+            st.session_state.agent.discussion.groups = []
+            st.session_state.agent.last_response = LLMOutput()
+            st.session_state.agent._responses = []
+        elif num_groups > 3:
+            # drop the last group (last action), then clear the summary and messages of the new last group
+            st.session_state.agent.discussion.groups.pop()
+            last_group = copy.deepcopy(st.session_state.agent.discussion.groups[-1])
+            last_group.summary = None
+            last_group.messages = last_group.messages[:0]  # remove everything from last group
+            st.session_state.agent.discussion.groups[-1] = last_group
+            st.session_state.agent._responses.pop()
+            st.session_state.agent.last_response = copy.deepcopy(
+                st.session_state.agent._responses[-1]
+            )
+        else:
+            raise Exception("Invalid number of groups")
 
 
 def restore_env_history(step: int):
@@ -534,9 +560,17 @@ def load_session(exp_files):
         st.session_state.action_history.append(step_info.action)
         st.session_state.action_info_history.append(step_info.agent_info)
         st.session_state.thought_history.append(step_info.agent_info.get("think", None))
-        st.session_state.prompt_history.append(get_prompt(step_info.agent_info))
         if isinstance(st.session_state.agent, GenericAgent):
             st.session_state.memory_history.append(step_info.agent_info.get("memory", None))
+            st.session_state.prompt_history.append(get_prompt(step_info.agent_info))
+        elif isinstance(st.session_state.agent, ToolUseAgent):
+            st.session_state.prompt_history.append(
+                "\n".join(
+                    [elem.to_markdown() for elem in st.session_state.agent.discussion.flatten()]
+                )
+            )
+        else:
+            raise ValueError(f"Unknown agent type: {type(st.session_state.agent)}")
         st.session_state.obs_history.append(step_info.obs)
         st.session_state.reward_history.append(step_info.reward)
         st.session_state.terminated_history.append(step_info.terminated)
@@ -573,7 +607,8 @@ def clean_session():
 def prepare_agent():
     st.session_state.agent_args.prepare()
     st.session_state.agent = st.session_state.agent_args.make_agent()
-    st.session_state.agent.set_task_name(st.session_state.task)
+    if isinstance(st.session_state.agent, ToolUseAgent):
+        st.session_state.agent.set_task_name(st.session_state.task)
 
 
 def set_environment_info():
@@ -863,9 +898,9 @@ def set_prompt_modifier():
             st.session_state.agent.config.obs.use_tabs = st.checkbox(
                 "Use tabs", value=st.session_state.agent.config.obs.use_tabs
             )
-            st.session_state.agent.config.obs.add_mouse_pointer = st.checkbox(
-                "Add mouse pointer", value=st.session_state.agent.config.obs.add_mouse_pointer
-            )
+            # st.session_state.agent.config.obs.add_mouse_pointer = st.checkbox(
+            #     "Add mouse pointer", value=st.session_state.agent.config.obs.add_mouse_pointer
+            # )
             st.session_state.agent.config.obs.use_zoomed_webpage = st.checkbox(
                 "Use zoomed webpage", value=st.session_state.agent.config.obs.use_zoomed_webpage
             )
@@ -1107,7 +1142,14 @@ def set_axtree_tab():
 
 
 def set_prompt_tab():
-    st.code(st.session_state.prompt_history[-1], language=None, wrap_lines=True)
+    if isinstance(st.session_state.agent, GenericAgent):
+        st.code(st.session_state.prompt_history[-1], language=None, wrap_lines=True)
+    elif isinstance(st.session_state.agent, ToolUseAgent):
+        st.markdown(st.session_state.prompt_history[-1])
+
+        st.markdown(f"## Last summary:\n{st.session_state.agent.discussion.get_last_summary()}")
+    else:
+        raise ValueError(f"Unknown agent type: {type(st.session_state.agent)}")
 
 
 def set_previous_steps_tab():
diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
@@ -589,29 +589,26 @@ class AzureOpenAIResponseModel(OpenAIResponseModel):
     def __init__(
         self,
         model_name: str,
+        base_url: Optional[str] = None,
         api_key: Optional[str] = None,
-        temperature: float = 0.5,
-        max_tokens: int = 100,
-        extra_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
+        temperature: float | None = None,
+        max_tokens: int | None = 100,
     ):
         api_key = os.getenv("AZURE_OPENAI_API_KEY")
-        self.tools = kwargs.pop("tools", None)
-        logging.info(f"Tools: {self.tools}")
-        super().__init__(
-            model_name=model_name,
-            api_key=api_key,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            extra_kwargs=extra_kwargs,
-            **kwargs,
-        )
-        # azure client takes extra kwargs
-        self.client = OpenAI(
-            api_key=api_key,
-            base_url=urljoin(os.getenv("AZURE_OPENAI_ENDPOINT"), "openai/v1"),
-            default_query={"api-version": "preview"},
+        base_url = urljoin(os.getenv("AZURE_OPENAI_ENDPOINT"), "openai/v1")
+        self.action_space_as_tools = True  # this should be a config
+        super().__init__(  # This is passed to BaseModel
+            model_name=model_name, api_key=api_key, temperature=temperature, max_tokens=max_tokens
         )
+        client_args = {}
+        if base_url is not None:
+            client_args["base_url"] = base_url
+        if api_key is not None:
+            client_args["api_key"] = api_key
+        client_args["default_query"] = {"api-version": "preview"}
+        self.client = OpenAI(**client_args)
+        # Init pricing tracker after super() so that all attributes have been set.
+        self.init_pricing_tracker(pricing_api="openai")  # Use the PricingMixin
 
 
 class OpenAIChatCompletionModel(BaseModelWithPricing):
@@ -958,9 +955,6 @@ def make_model(self, extra_kwargs=None, **kwargs):
             model_name=self.model_name,
             temperature=self.temperature,
             max_tokens=self.max_new_tokens,
-            extra_kwargs=extra_kwargs,
-            pricing_api="openai",
-            **kwargs,
         )