 from typing import TYPE_CHECKING, Any
 
 import bgym
+import numpy as np
 from browsergym.core.observation import extract_screenshot
+from PIL import Image, ImageDraw
 
 from agentlab.agents.agent_args import AgentArgs
 from agentlab.llm.llm_utils import image_to_png_base64_url
-from agentlab.llm.response_api import OpenAIResponseModelArgs
+from agentlab.llm.response_api import (
+    ClaudeResponseModelArgs,
+    MessageBuilder,
+    OpenAIResponseModelArgs,
+)
 from agentlab.llm.tracking import cost_tracker_decorator
 
 if TYPE_CHECKING:
     from openai.types.responses import Response
 
 
+def tag_screenshot_with_action(screenshot: Image.Image, action: str) -> Image.Image:
+    """
+    If the action is a coordinate action, try to render it on the screenshot.
+
+    e.g. mouse_click(120, 130) -> draw a dot at (120, 130) on the screenshot
+
+    Args:
+        screenshot: The screenshot to tag.
+        action: The action to tag the screenshot with.
+
+    Returns:
+        The tagged screenshot. If the coordinates cannot be parsed, a warning
+        is logged and the screenshot is returned unmodified.
+    """
+    if action.startswith("mouse_click"):
+        try:
+            # Accept both "mouse_click(120, 130)" and "mouse_click(x=120, y=130)".
+            coords = action[action.index("(") + 1 : action.index(")")].split(",")
+            coords = [c.strip() for c in coords]
+            if len(coords) != 2:
+                raise ValueError(f"Invalid coordinate format: {coords}")
+            if coords[0].startswith("x="):
+                coords[0] = coords[0][2:]
+            if coords[1].startswith("y="):
+                coords[1] = coords[1][2:]
+            x, y = float(coords[0].strip()), float(coords[1].strip())
+            draw = ImageDraw.Draw(screenshot)
+            radius = 5
+            draw.ellipse(
+                (x - radius, y - radius, x + radius, y + radius), fill="red", outline="red"
+            )
+        except (ValueError, IndexError) as e:
+            logging.warning(f"Failed to parse action '{action}': {e}")
+    return screenshot
+
+
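+# A minimal usage sketch (illustrative only; `img` is a hypothetical PIL image):
+#   img = Image.open("screenshot.png")
+#   img = tag_screenshot_with_action(img, "mouse_click(x=120, y=130)")
+#   img.save("screenshot_tagged.png")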
 @dataclass
 class ToolUseAgentArgs(AgentArgs):
     temperature: float = 0.1
@@ -48,14 +91,18 @@ def __init__(
         self,
         temperature: float,
         model_args: OpenAIResponseModelArgs,
+        use_first_obs: bool = True,
+        tag_screenshot: bool = True,
     ):
         self.temperature = temperature
         self.chat = model_args.make_model()
         self.model_args = model_args
+        self.use_first_obs = use_first_obs
+        self.tag_screenshot = tag_screenshot
 
         self.action_set = bgym.HighLevelActionSet(["coord"], multiaction=False)
 
-        self.tools = self.action_set.to_tool_description()
+        self.tools = self.action_set.to_tool_description(api="anthropic")
 
         # self.tools.append(
         #     {
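+        # Rough shape of an Anthropic-format tool entry (assumed, for orientation):
+        #   {"name": "mouse_click", "description": "...",
+        #    "input_schema": {"type": "object", "properties": {"x": {...}, "y": {...}}}}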
@@ -77,87 +124,94 @@ def __init__(
 
         self.llm = model_args.make_model(extra_kwargs={"tools": self.tools})
 
-        self.messages = []
+        self.messages: list[MessageBuilder] = []
 
     def obs_preprocessor(self, obs):
         page = obs.pop("page", None)
         if page is not None:
             obs["screenshot"] = extract_screenshot(page)
+            if self.tag_screenshot:
+                # Round-trip through PIL to draw the previous click, then back to ndarray.
+                obs["screenshot"] = Image.fromarray(obs["screenshot"])
+                obs["screenshot"] = tag_screenshot_with_action(
+                    obs["screenshot"], obs["last_action"]
+                )
+                obs["screenshot"] = np.array(obs["screenshot"])
         else:
             raise ValueError("No page found in the observation.")
 
         return obs
 
     @cost_tracker_decorator
     def get_action(self, obs: Any) -> tuple[str, dict]:
-
         if len(self.messages) == 0:
-            system_message = {
-                "role": "system",
-                "content": "You are an agent. Based on the observation, you will decide which action to take to accomplish your goal.",
-            }
-            goal_object = [el for el in obs["goal_object"]]
-            for content in goal_object:
-                if content["type"] == "text":
-                    content["type"] = "input_text"
-                elif content["type"] == "image_url":
-                    content["type"] = "input_image"
-            goal_message = {"role": "user", "content": goal_object}
-            goal_message["content"].append(
-                {
-                    "type": "input_image",
-                    "image_url": image_to_png_base64_url(obs["screenshot"]),
-                }
+            system_message = MessageBuilder.system().add_text(
+                "You are an agent. Based on the observation, you will decide which action to take to accomplish your goal."
             )
             self.messages.append(system_message)
+
+            goal_message = MessageBuilder.user()
+            for content in obs["goal_object"]:
+                if content["type"] == "text":
+                    goal_message.add_text(content["text"])
+                elif content["type"] == "image_url":
+                    goal_message.add_image(content["image_url"])
             self.messages.append(goal_message)
+
+            if self.use_first_obs:
+                message = MessageBuilder.user().add_text(
+                    "Here is the first observation. A red dot on screenshots indicates the previous click action:"
+                )
+                message.add_image(image_to_png_base64_url(obs["screenshot"]))
+                self.messages.append(message)
         else:
             if obs["last_action_error"] == "":
-                self.messages.append(
-                    {
-                        "type": "function_call_output",
-                        "call_id": self.previous_call_id,
-                        "output": "Function call executed, see next observation.",
-                    }
-                )
-                self.messages.append(
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "input_image",
-                                "image_url": image_to_png_base64_url(obs["screenshot"]),
-                            }
-                        ],
-                    }
+                tool_message = MessageBuilder.tool().add_image(
+                    image_to_png_base64_url(obs["screenshot"])
                 )
+                tool_message.add_tool_id(self.previous_call_id)
+                self.messages.append(tool_message)
             else:
-                self.messages.append(
-                    {
-                        "type": "function_call_output",
-                        "call_id": self.previous_call_id,
-                        "output": f"Function call failed: {obs['last_action_error']}",
-                    }
+                tool_message = MessageBuilder.tool().add_text(
+                    f"Function call failed: {obs['last_action_error']}"
                 )
+                tool_message.add_tool_id(self.previous_call_id)
+                self.messages.append(tool_message)
 
+        # Render MessageBuilder objects to Anthropic-format dicts; raw assistant
+        # dicts (appended after each response) pass through unchanged.
+        messages = []
+        for msg in self.messages:
+            if isinstance(msg, MessageBuilder):
+                messages += msg.to_anthropic()
+            else:
+                messages.append(msg)
         response: "Response" = self.llm(
-            messages=self.messages,
+            messages=messages,
             temperature=self.temperature,
         )
 
         action = "noop()"
         think = ""
-        for output in response.output:
-            if output.type == "function_call":
-                arguments = json.loads(output.arguments)
-                action = f"{output.name}({", ".join([f"{k}={v}" for k, v in arguments.items()])})"
-                self.previous_call_id = output.call_id
-                self.messages.append(output)
-                break
-            elif output.type == "reasoning":
-                if len(output.summary) > 0:
-                    think += output.summary[0].text + "\n"
-                self.messages.append(output)
+        # openai
+        # for output in response.output:
+        #     if output.type == "function_call":
+        #         arguments = json.loads(output.arguments)
+        #         action = f"{output.name}({", ".join([f"{k}={v}" for k, v in arguments.items()])})"
+        #         self.previous_call_id = output.call_id
+        #         self.messages.append(output)
+        #         break
+        #     elif output.type == "reasoning":
+        #         if len(output.summary) > 0:
+        #             think += output.summary[0].text + "\n"
+        #         self.messages.append(output)
+
+        # anthropic
+        for output in response.content:
+            if output.type == "text":
+                think += output.text
+            elif output.type == "tool_use":
+                # e.g. tool_use(name="mouse_click", input={"x": 120, "y": 130})
+                #   -> 'mouse_click(x=120, y=130)'; string arguments are quoted.
+                action = f"{output.name}({', '.join([f'{k}=\"{v}\"' if isinstance(v, str) else f'{k}={v}' for k, v in output.input.items()])})"
+                self.previous_call_id = output.id
+
+        self.messages.append({"role": "assistant", "content": response.content})
 
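+        # The running transcript now looks roughly like (sketch):
+        #   system -> user(goal) -> user(first screenshot)
+        #   -> assistant(tool_use) -> tool(screenshot) -> assistant(tool_use) -> ...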
         return (
             action,
@@ -170,15 +224,26 @@ def get_action(self, obs: Any) -> tuple[str, dict]:
 
 
 MODEL_CONFIG = OpenAIResponseModelArgs(
-    model_name="o4-mini-2025-04-16",
+    model_name="gpt-4o",
+    max_total_tokens=200_000,
+    max_input_tokens=200_000,
+    max_new_tokens=2_000,
+    temperature=0.1,
+    vision_support=True,
+)
+
+
+CLAUDE_MODEL_CONFIG = ClaudeResponseModelArgs(
+    model_name="claude-3-7-sonnet-20250219",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
-    max_new_tokens=100_000,
+    max_new_tokens=2_000,
     temperature=0.1,
     vision_support=True,
 )
 
+
 AGENT_CONFIG = ToolUseAgentArgs(
     temperature=0.1,
-    model_args=MODEL_CONFIG,
+    model_args=CLAUDE_MODEL_CONFIG,
 )