tool agent embryo

TLSDC · TLSDC · commit 73ba42843066 · 2025-04-23T14:44:20.000-04:00
diff --git a/src/agentlab/agents/tool_use_agent/__init__.py b/src/agentlab/agents/tool_use_agent/__init__.py
diff --git a/src/agentlab/agents/tool_use_agent/agent.py b/src/agentlab/agents/tool_use_agent/agent.py
@@ -0,0 +1,184 @@
+import json
+import logging
+from copy import deepcopy as copy
+from dataclasses import asdict, dataclass
+from typing import TYPE_CHECKING, Any
+
+import bgym
+from browsergym.core.observation import extract_screenshot
+
+from agentlab.agents.agent_args import AgentArgs
+from agentlab.llm.llm_utils import image_to_png_base64_url
+from agentlab.llm.response_api import OpenAIResponseModelArgs
+from agentlab.llm.tracking import cost_tracker_decorator
+
+if TYPE_CHECKING:
+    from openai.types.responses import Response
+
+
+@dataclass
+class ToolUseAgentArgs(AgentArgs):
+    """Arguments from which a :class:`ToolUseAgent` is built."""
+
+    # Temperature handed to the agent by make_agent().
+    temperature: float = 0.1
+    # Model configuration handed to the agent; defaults to None.
+    model_args: OpenAIResponseModelArgs = None
+
+    def __post_init__(self):
+        """Set ``agent_name`` to ``ToolUse-<model name>`` with ``/`` replaced by ``_``."""
+        try:
+            raw_name = f"ToolUse-{self.model_args.model_name}"
+            self.agent_name = raw_name.replace("/", "_")
+        except AttributeError:
+            # Best effort: e.g. model_args is still None, so agent_name is left untouched.
+            pass
+
+    def make_agent(self) -> bgym.Agent:
+        """Instantiate the agent from the stored temperature and model arguments."""
+        agent_kwargs = {
+            "temperature": self.temperature,
+            "model_args": self.model_args,
+        }
+        return ToolUseAgent(**agent_kwargs)
+
+    def set_reproducibility_mode(self):
+        """Force the temperature to 0."""
+        self.temperature = 0
+
+    def prepare(self):
+        """Delegate to ``model_args.prepare_server()`` and return its result."""
+        prepared = self.model_args.prepare_server()
+        return prepared
+
+    def close(self):
+        """Delegate to ``model_args.close_server()`` and return its result."""
+        closed = self.model_args.close_server()
+        return closed
+
+
+class ToolUseAgent(bgym.Agent):
+    """Screenshot-driven agent that selects actions through model function calls.
+
+    The whole conversation is kept in ``self.messages`` and passed to the model on
+    every call to :meth:`get_action`.
+    """
+
+    def __init__(
+        self,
+        temperature: float,
+        model_args: OpenAIResponseModelArgs,
+    ):
+        """Build the action set, its tool description and the tool-enabled model.
+
+        Args:
+            temperature: Temperature passed to the model on every call.
+            model_args: Factory for the model objects (``make_model``).
+        """
+        self.temperature = temperature
+        # Model built without tools; kept as a public attribute, not called by this class.
+        self.chat = model_args.make_model()
+        self.model_args = model_args
+
+        self.action_set = bgym.HighLevelActionSet(["coord"], multiaction=False)
+
+        self.tools = self.action_set.to_tool_description()
+
+        # Disabled: optional "chain_of_thought" tool.
+        # self.tools.append(
+        #     {
+        #         "type": "function",
+        #         "name": "chain_of_thought",
+        #         "description": "A tool that allows the agent to think step by step. Every other action must ALWAYS be preceded by a call to this tool.",
+        #         "parameters": {
+        #             "type": "object",
+        #             "properties": {
+        #                 "thoughts": {
+        #                     "type": "string",
+        #                     "description": "The agent's reasoning process.",
+        #                 },
+        #             },
+        #             "required": ["thoughts"],
+        #         },
+        #     }
+        # )
+
+        self.llm = model_args.make_model(extra_kwargs={"tools": self.tools})
+
+        self.messages = []
+        # call_id of the function call whose output has not been reported yet;
+        # None when no call is pending (first step, or a response without a call).
+        self.previous_call_id = None
+
+    def obs_preprocessor(self, obs):
+        """Replace the ``page`` entry of ``obs`` with a ``screenshot`` entry.
+
+        Raises:
+            ValueError: if ``obs`` has no ``page`` (or it is ``None``).
+        """
+        page = obs.pop("page", None)
+        if page is None:
+            raise ValueError("No page found in the observation.")
+
+        obs["screenshot"] = extract_screenshot(page)
+        return obs
+
+    @staticmethod
+    def _to_response_content(goal_object):
+        """Return copies of the goal parts with ``text``/``image_url`` renamed to input parts.
+
+        The parts are deep-copied so the observation owned by the caller is never
+        modified. A nested ``{"url": ...}`` image payload is flattened to the bare
+        URL string, which is the shape the Responses API expects for ``input_image``.
+        """
+        parts = []
+        for original in goal_object:
+            part = copy(original)
+            if part["type"] == "text":
+                part["type"] = "input_text"
+            elif part["type"] == "image_url":
+                part["type"] = "input_image"
+                image = part.get("image_url")
+                if isinstance(image, dict):
+                    part["image_url"] = image["url"]
+                    if "detail" in image:
+                        part.setdefault("detail", image["detail"])
+            parts.append(part)
+        return parts
+
+    @staticmethod
+    def _screenshot_part(obs):
+        """Build the ``input_image`` content part for the current screenshot."""
+        return {
+            "type": "input_image",
+            "image_url": image_to_png_base64_url(obs["screenshot"]),
+        }
+
+    @staticmethod
+    def _tool_call_to_action(name, arguments):
+        """Render a function call as ``name(key=value, ...)`` using ``repr`` for values.
+
+        ``repr`` keeps string arguments quoted and escaped, so the result can be
+        parsed back as a Python call.
+        """
+        rendered = ", ".join(f"{key}={value!r}" for key, value in arguments.items())
+        return f"{name}({rendered})"
+
+    def _append_feedback(self, obs):
+        """Append the outcome of the previous step to the conversation."""
+        error = obs["last_action_error"]
+        succeeded = error == ""
+
+        if self.previous_call_id is not None:
+            if succeeded:
+                output = "Function call executed, see next observation."
+            else:
+                output = f"Function call failed: {error}"
+            self.messages.append(
+                {
+                    "type": "function_call_output",
+                    "call_id": self.previous_call_id,
+                    "output": output,
+                }
+            )
+            # The call has been answered; never reuse its id for a later step.
+            self.previous_call_id = None
+        elif not succeeded:
+            # No function call to attach the error to: report it as plain text.
+            self.messages.append(
+                {
+                    "role": "user",
+                    "content": [{"type": "input_text", "text": f"Action failed: {error}"}],
+                }
+            )
+
+        if succeeded:
+            self.messages.append({"role": "user", "content": [self._screenshot_part(obs)]})
+
+    @cost_tracker_decorator
+    def get_action(self, obs: Any) -> tuple[str, bgym.AgentInfo]:
+        """Extend the conversation with ``obs``, query the model and return its action.
+
+        Args:
+            obs: Preprocessed observation. ``goal_object`` and ``screenshot`` are read
+                on the first call; ``last_action_error`` and ``screenshot`` afterwards.
+
+        Returns:
+            ``(action, agent_info)``. ``action`` is the first function call of the
+            response rendered as a call string, or ``"noop()"`` when the response
+            contains none. ``agent_info.think`` collects the reasoning summaries.
+        """
+        if not self.messages:
+            system_message = {
+                "role": "system",
+                "content": "You are an agent. Based on the observation, you will decide which action to take to accomplish your goal.",
+            }
+            goal_content = self._to_response_content(obs["goal_object"])
+            goal_content.append(self._screenshot_part(obs))
+            self.messages.append(system_message)
+            self.messages.append({"role": "user", "content": goal_content})
+        else:
+            self._append_feedback(obs)
+
+        response: "Response" = self.llm(
+            messages=self.messages,
+            temperature=self.temperature,
+        )
+
+        action = "noop()"
+        think = ""
+        for output in response.output:
+            if output.type == "function_call":
+                arguments = json.loads(output.arguments)
+                action = self._tool_call_to_action(output.name, arguments)
+                self.previous_call_id = output.call_id
+                self.messages.append(output)
+                # Only the first function call is executed; later items are ignored.
+                break
+            elif output.type == "reasoning":
+                if output.summary:
+                    think += output.summary[0].text + "\n"
+                self.messages.append(output)
+
+        return (
+            action,
+            bgym.AgentInfo(
+                think=think,
+                chat_messages=[],
+                stats={},
+            ),
+        )
+
+
+# Default model arguments for the tool-use agent.
+# NOTE(review): ResponseModel.__call__ in response_api.py has `temperature` commented out
+# of the request, so the temperature set here may currently have no effect -- confirm.
+MODEL_CONFIG = OpenAIResponseModelArgs(
+    model_name="o4-mini-2025-04-16",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=100_000,
+    temperature=0.1,
+    vision_support=True,
+)
+
+# Default agent arguments, built on MODEL_CONFIG.
+AGENT_CONFIG = ToolUseAgentArgs(
+    temperature=0.1,
+    model_args=MODEL_CONFIG,
+)
diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
@@ -0,0 +1,167 @@
+import logging
+from dataclasses import dataclass
+
+import openai
+from openai import OpenAI
+
+from .base_api import AbstractChatModel, BaseModelArgs
+
+
+class ResponseModel(AbstractChatModel):
+    """Wrapper around ``responses.create`` of the OpenAI client."""
+
+    def __init__(
+        self,
+        model_name,
+        api_key=None,
+        temperature=0.5,
+        max_tokens=100,
+        extra_kwargs=None,
+    ):
+        """Store the request settings and create the OpenAI client.
+
+        Args:
+            model_name: Model identifier sent with every request.
+            api_key: API key handed to the ``OpenAI`` client.
+            temperature: Stored on the instance; see ``__call__`` for how it is used.
+            max_tokens: Sent as ``max_output_tokens``.
+            extra_kwargs: Additional keyword arguments for ``responses.create``
+                (for example ``tools``). They take precedence over the defaults
+                set in ``__call__``.
+        """
+        self.model_name = model_name
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.extra_kwargs = extra_kwargs or {}
+        self.client = OpenAI(api_key=api_key)
+
+    def __call__(
+        self, content: list[dict], temperature: float = None
+    ) -> "openai.types.responses.Response":
+        """Send ``content`` as the request input and return the raw API response.
+
+        Args:
+            content: Input items for the request.
+            temperature: Accepted for interface compatibility but not forwarded;
+                the request has never included a temperature
+                (NOTE(review): presumably because reasoning models reject it -- confirm).
+
+        Returns:
+            The object returned by ``client.responses.create``.
+
+        Raises:
+            openai.OpenAIError: re-raised after being logged.
+        """
+        # Defaults first, so that extra_kwargs can override them instead of
+        # colliding with them as duplicate keyword arguments.
+        request_kwargs = {
+            "reasoning": {
+                "effort": "low",
+                "summary": "detailed",
+            },
+        }
+        # Forcing a tool call only makes sense when tools were configured.
+        if self.extra_kwargs.get("tools"):
+            request_kwargs["tool_choice"] = "required"
+        request_kwargs.update(self.extra_kwargs)
+
+        try:
+            return self.client.responses.create(
+                model=self.model_name,
+                input=content,
+                max_output_tokens=self.max_tokens,
+                **request_kwargs,
+            )
+        except openai.OpenAIError as e:
+            logging.error(f"Failed to get a response from the API: {e}")
+            raise
+
+
+class OpenAIResponseModel(ResponseModel):
+    """``ResponseModel`` subclass that forwards everything to its parent unchanged."""
+
+    def __init__(
+        self, model_name, api_key=None, temperature=0.5, max_tokens=100, extra_kwargs=None
+    ):
+        """Pass all constructor arguments through to ``ResponseModel``."""
+        parent_kwargs = {
+            "model_name": model_name,
+            "api_key": api_key,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "extra_kwargs": extra_kwargs,
+        }
+        super().__init__(**parent_kwargs)
+
+    def __call__(self, messages: list[dict], temperature: float = None) -> dict:
+        """Return the parent's response for ``messages`` without post-processing."""
+        # Disabled parsing of "computer_call" outputs, kept for reference:
+        # outputs = response.output
+        # last_computer_call_id = None
+        # answer_type = "call"
+        # reasoning = "No reasoning"
+        # for output in outputs:
+        #     if output.type == "reasoning":
+        #         reasoning = output.summary[0].text
+        #     elif output.type == "computer_call":
+        #         action = output.action
+        #         last_computer_call_id = output.call_id
+        #         res = response_to_text(action)
+        #     elif output.type == "message":
+        #         res = "noop()"
+        #         answer_type = "message"
+        #     else:
+        #         logging.warning(f"Unrecognized output type: {output.type}")
+        #         continue
+        # return {
+        #     "think": reasoning,
+        #     "action": res,
+        #     "last_computer_call_id": last_computer_call_id,
+        #     "last_response_id": response.id,
+        #     "outputs": outputs,
+        #     "answer_type": answer_type,
+        # }
+        raw_response = super().__call__(messages, temperature)
+        return raw_response
+
+
+def response_to_text(action):
+    """
+    Given a computer action (e.g., click, double_click, scroll, etc.),
+    convert it to a text description.
+
+    Args:
+        action: Object exposing ``type`` plus the attributes of that action kind:
+            ``x``/``y``/``button`` (click), ``x``/``y``/``scroll_x``/``scroll_y``
+            (scroll), ``keys`` (keypress), ``text`` (type), ``path`` (drag).
+
+    Returns:
+        The action string; several newline-separated calls when the action maps
+        to more than one call (scroll, multi-key keypress). ``None`` for
+        ``screenshot``, an unrecognized type, an empty key list, or when handling
+        the action raised (the error is printed).
+    """
+    action_type = action.type
+
+    try:
+        match action_type:
+
+            case "click":
+                x, y = action.x, action.y
+                button = action.button
+                print(f"Action: click at ({x}, {y}) with button '{button}'")
+                # Not handling things like middle click, etc.
+                if button not in ("left", "right"):
+                    button = "left"
+                return f"mouse_click({x}, {y}, button='{button}')"
+
+            case "scroll":
+                x, y = action.x, action.y
+                scroll_x, scroll_y = action.scroll_x, action.scroll_y
+                print(
+                    f"Action: scroll at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})"
+                )
+                return f"mouse_move({x}, {y})\nscroll({scroll_x}, {scroll_y})"
+
+            case "keypress":
+                presses = []
+                for k in action.keys:
+                    print(f"Action: keypress '{k}'")
+                    # A simple mapping for common keys; expand as needed.
+                    lowered = k.lower()
+                    if lowered == "enter":
+                        key = "Enter"
+                    elif lowered == "space":
+                        key = " "
+                    else:
+                        key = k
+                    # repr() quotes and escapes the key name safely.
+                    presses.append(f"keyboard_press({key!r})")
+                # One call per key, newline-separated like the scroll case, so
+                # keys after the first are no longer dropped.
+                if presses:
+                    return "\n".join(presses)
+
+            case "type":
+                text = action.text
+                print(f"Action: type text: {text}")
+                # repr() keeps quotes, backslashes and newlines in the text valid.
+                return f"keyboard_type({text!r})"
+
+            case "wait":
+                print(f"Action: wait")
+                return "noop()"
+
+            case "screenshot":
+                # Nothing to do as screenshot is taken at each turn
+                print(f"Action: screenshot")
+
+            # Handle other actions here
+
+            case "drag":
+                path = action.path
+                if len(path) < 2:
+                    raise ValueError("drag path needs at least two points")
+                # Drop at the final point of the path, not at the second one.
+                x1, y1 = path[0].x, path[0].y
+                x2, y2 = path[-1].x, path[-1].y
+                print(f"Action: drag from ({x1}, {y1}) to ({x2}, {y2})")
+                return f"mouse_drag_and_drop({x1}, {y1}, {x2}, {y2})"
+
+            case _:
+                print(f"Unrecognized action: {action}")
+
+    except Exception as e:
+        # Best effort: a malformed action is reported and yields None.
+        print(f"Error handling action {action}: {e}")
+
+    return None
+
+
+@dataclass
+class OpenAIResponseModelArgs(BaseModelArgs):
+    """Serializable arguments for instantiating an :class:`OpenAIResponseModel`."""
+
+    def make_model(self, extra_kwargs=None):
+        """Build the model from ``model_name``, ``temperature`` and ``max_new_tokens``.
+
+        ``extra_kwargs`` is handed to the model unchanged.
+        """
+        model_kwargs = {
+            "model_name": self.model_name,
+            "temperature": self.temperature,
+            "max_tokens": self.max_new_tokens,
+            "extra_kwargs": extra_kwargs,
+        }
+        return OpenAIResponseModel(**model_kwargs)