Unified Claude and OpenAI response APIs

TLSDC · TLSDC · commit 0fc43cce1da8 · 2025-05-02T15:11:38.000-04:00
diff --git a/src/agentlab/agents/tool_use_agent/agent.py b/src/agentlab/agents/tool_use_agent/agent.py
@@ -15,6 +15,7 @@
     ClaudeResponseModelArgs,
     MessageBuilder,
     OpenAIResponseModelArgs,
+    ResponseLLMOutput,
 )
 from agentlab.llm.tracking import cost_tracker_decorator
 
@@ -61,8 +62,9 @@ def tag_screenshot_with_action(screenshot: Image, action: str) -> Image:
 
 @dataclass
 class ToolUseAgentArgs(AgentArgs):
-    temperature: float = 0.1
     model_args: OpenAIResponseModelArgs = None
+    use_first_obs: bool = True
+    tag_screenshot: bool = True
 
     def __post_init__(self):
         try:
@@ -72,13 +74,11 @@ def __post_init__(self):
 
     def make_agent(self) -> bgym.Agent:
         return ToolUseAgent(
-            temperature=self.temperature,
             model_args=self.model_args,
+            use_first_obs=self.use_first_obs,
+            tag_screenshot=self.tag_screenshot,
         )
 
-    def set_reproducibility_mode(self):
-        self.temperature = 0
-
     def prepare(self):
         return self.model_args.prepare_server()
 
@@ -89,20 +89,18 @@ def close(self):
 class ToolUseAgent(bgym.Agent):
     def __init__(
         self,
-        temperature: float,
         model_args: OpenAIResponseModelArgs,
         use_first_obs: bool = True,
         tag_screenshot: bool = True,
     ):
-        self.temperature = temperature
         self.chat = model_args.make_model()
         self.model_args = model_args
         self.use_first_obs = use_first_obs
         self.tag_screenshot = tag_screenshot
 
         self.action_set = bgym.HighLevelActionSet(["coord"], multiaction=False)
 
-        self.tools = self.action_set.to_tool_description(api="anthropic")
+        self.tools = self.action_set.to_tool_description(api=model_args.api)
 
         # self.tools.append(
         #     {
@@ -131,11 +129,9 @@ def obs_preprocessor(self, obs):
         if page is not None:
             obs["screenshot"] = extract_screenshot(page)
             if self.tag_screenshot:
-                obs["screenshot"] = Image.fromarray(obs["screenshot"])
-                obs["screenshot"] = tag_screenshot_with_action(
-                    obs["screenshot"], obs["last_action"]
-                )
-                obs["screenshot"] = np.array(obs["screenshot"])
+                screenshot = Image.fromarray(obs["screenshot"])
+                screenshot = tag_screenshot_with_action(screenshot, obs["last_action"])
+                obs["screenshot_tag"] = np.array(screenshot)
         else:
             raise ValueError("No page found in the observation.")
 
@@ -158,16 +154,25 @@ def get_action(self, obs: Any) -> float:
             self.messages.append(goal_message)
 
             if self.use_first_obs:
-                message = MessageBuilder.user().add_text(
-                    "Here is the first observation. A red dot on screenshots indicate the previous click action:"
-                )
-                message.add_image(image_to_png_base64_url(obs["screenshot"]))
+                if self.tag_screenshot:
+                    message = MessageBuilder.user().add_text(
+                        "Here is the first observation. A red dot on screenshots indicate the previous click action:"
+                    )
+                    message.add_image(image_to_png_base64_url(obs["screenshot_tag"]))
+                else:
+                    message = MessageBuilder.user().add_text("Here is the first observation:")
+                    message.add_image(image_to_png_base64_url(obs["screenshot"]))
                 self.messages.append(message)
         else:
             if obs["last_action_error"] == "":
-                tool_message = MessageBuilder.tool().add_image(
-                    image_to_png_base64_url(obs["screenshot"])
-                )
+                if self.tag_screenshot:
+                    tool_message = MessageBuilder.tool().add_image(
+                        image_to_png_base64_url(obs["screenshot_tag"])
+                    )
+                else:
+                    tool_message = MessageBuilder.tool().add_image(
+                        image_to_png_base64_url(obs["screenshot"])
+                    )
                 tool_message.add_tool_id(self.previous_call_id)
                 self.messages.append(tool_message)
             else:
@@ -177,21 +182,12 @@ def get_action(self, obs: Any) -> float:
                 tool_message.add_tool_id(self.previous_call_id)
                 self.messages.append(tool_message)
 
-        messages = []
-        for msg in self.messages:
-            if isinstance(msg, MessageBuilder):
-                messages += msg.to_anthropic()
-            else:
-                messages.append(msg)
-        response: "Response" = self.llm(
-            messages=messages,
-            temperature=self.temperature,
-        )
+        response: ResponseLLMOutput = self.llm(messages=self.messages)
 
-        action = response["action"]
-        think = response["think"]
-        self.previous_call_id = response["last_computer_call_id"]
-        self.messages.append(response["assistant_message"])
+        action = response.action
+        think = response.think
+        self.previous_call_id = response.last_computer_call_id
+        self.messages.append(response.assistant_message)
 
         return (
             action,
@@ -203,8 +199,8 @@ def get_action(self, obs: Any) -> float:
         )
 
 
-MODEL_CONFIG = OpenAIResponseModelArgs(
-    model_name="gpt-4o",
+OPENAI_MODEL_CONFIG = OpenAIResponseModelArgs(
+    model_name="gpt-4.1",
     max_total_tokens=200_000,
     max_input_tokens=200_000,
     max_new_tokens=2_000,
@@ -224,6 +220,5 @@ def get_action(self, obs: Any) -> float:
 
 
 AGENT_CONFIG = ToolUseAgentArgs(
-    temperature=0.1,
-    model_args=CLAUDE_MODEL_CONFIG,
+    model_args=OPENAI_MODEL_CONFIG,
 )
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
@@ -634,6 +634,9 @@ def dict_to_markdown(d: dict):
 
     dict: type = dict[str, str | list[dict[...]]]
     """
+    if not isinstance(d, dict):
+        warning(f"Expected dict, got {type(d)}")
+        return repr(d)
     if not d:
         return "No Data"
     res = ""
@@ -661,7 +664,7 @@ def update_chat_messages():
 
     if isinstance(chat_messages, list) and isinstance(chat_messages[0], MessageBuilder):
         chat_messages = [
-            m.to_markdown() if not isinstance(m, dict) else dict_to_markdown(m)
+            m.to_markdown() if isinstance(m, MessageBuilder) else dict_to_markdown(m)
             for m in chat_messages
         ]
         return "\n\n".join(chat_messages)
diff --git a/src/agentlab/llm/response_api.py b/src/agentlab/llm/response_api.py
@@ -15,6 +15,17 @@
 type Message = Dict[str, Union[str, List[ContentItem]]]
 
 
+@dataclass
+class ResponseLLMOutput:
+    """Serializable object for the output of a response LLM."""
+
+    raw_response: Any
+    think: str
+    action: str
+    last_computer_call_id: str
+    assistant_message: Any
+
+
 class MessageBuilder:
     def __init__(self, role: str):
         self.role = role
@@ -63,13 +74,17 @@ def to_openai(self) -> List[Message]:
             # tool messages can only take text with openai
             # we need to split the first content element if it's text and use it
             # then open a new (user) message with the rest
-            res[0]["tool_call_id"] = self.tool_call_id
+            # a function_call_output dict has keys "call_id", "type" and "output"
+            res[0]["call_id"] = self.tool_call_id
+            res[0]["type"] = "function_call_output"
+            res[0].pop("role", None)  # make sure to remove role
             text_content = (
                 content.pop(0)["text"]
                 if "text" in content[0]
                 else "Tool call answer in next message"
             )
-            res[0]["content"] = text_content
+            res[0]["output"] = text_content
+            res[0].pop("content", None)  # make sure to remove content
             res.append({"role": "user", "content": content})
 
         return res
@@ -116,6 +131,8 @@ def to_anthropic(self) -> List[Message]:
             ]
         return res
 
+    def to_chat_completion(self) -> List[Message]: ...
+
     def to_markdown(self) -> str:
         content = []
         for item in self.content:
@@ -159,12 +176,12 @@ def __call__(self, messages: list[dict | MessageBuilder]) -> dict:
         return self._parse_response(response)
 
     @abstractmethod
-    def _call_api(self, messages: list[dict | MessageBuilder]) -> dict:
+    def _call_api(self, messages: list[dict | MessageBuilder]) -> Any:
         """Make a call to the model API and return the raw response."""
         pass
 
     @abstractmethod
-    def _parse_response(self, response: dict) -> dict:
+    def _parse_response(self, response: Any) -> ResponseLLMOutput:
         """Parse the raw response from the model API and return a structured response."""
         pass
 
@@ -187,11 +204,17 @@ def __init__(
         )
         self.client = OpenAI(api_key=api_key)
 
-    def _call_api(self, messages: list[dict | MessageBuilder]) -> dict:
+    def _call_api(self, messages: list[Any | MessageBuilder]) -> dict:
+        input = []
+        for msg in messages:
+            if isinstance(msg, MessageBuilder):
+                input += msg.to_openai()
+            else:
+                input.append(msg)
         try:
             response = self.client.responses.create(
                 model=self.model_name,
-                input=messages,
+                input=input,
                 temperature=self.temperature,
                 # previous_response_id=content.get("previous_response_id", None),
                 max_output_tokens=self.max_tokens,
@@ -208,27 +231,25 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> dict:
             raise e
 
     def _parse_response(self, response: dict) -> dict:
-        result = {
-            "raw_response": response,
-            "think": "",
-            "action": "noop()",
-            "last_computer_call_id": None,
-            "assistant_message": {
-                "role": "assistant",
-                "content": response.output,
-            },
-        }
+        result = ResponseLLMOutput(
+            raw_response=response,
+            think="",
+            action="noop()",
+            last_computer_call_id=None,
+            assistant_message=None,
+        )
         for output in response.output:
             if output.type == "function_call":
                 arguments = json.loads(output.arguments)
-                result["action"] = (
+                result.action = (
                     f"{output.name}({", ".join([f"{k}={v}" for k, v in arguments.items()])})"
                 )
-                result["last_computer_call_id"] = output.call_id
+                result.last_computer_call_id = output.call_id
+                result.assistant_message = output
                 break
             elif output.type == "reasoning":
                 if len(output.summary) > 0:
-                    result["think"] += output.summary[0].text + "\n"
+                    result.think += output.summary[0].text + "\n"
         return result
 
 
@@ -251,10 +272,16 @@ def __init__(
         self.client = Anthropic(api_key=api_key)
 
     def _call_api(self, messages: list[dict | MessageBuilder]) -> dict:
+        input = []
+        for msg in messages:
+            if isinstance(msg, MessageBuilder):
+                input += msg.to_anthropic()
+            else:
+                input.append(msg)
         try:
             response = self.client.messages.create(
                 model=self.model_name,
-                messages=messages,
+                messages=input,
                 temperature=self.temperature,
                 max_tokens=self.max_tokens,
                 **self.extra_kwargs,
@@ -265,24 +292,22 @@ def _call_api(self, messages: list[dict | MessageBuilder]) -> dict:
             raise e
 
     def _parse_response(self, response: dict) -> dict:
-        result = {
-            "raw_response": response,
-            "think": "",
-            "action": "noop()",
-            "last_computer_call_id": None,
-            "assistant_message": {
+        result = ResponseLLMOutput(
+            raw_response=response,
+            think="",
+            action="noop()",
+            last_computer_call_id=None,
+            assistant_message={
                 "role": "assistant",
                 "content": response.content,
             },
-        }
+        )
         for output in response.content:
             if output.type == "tool_use":
-                result["action"] = (
-                    f"{output.name}({', '.join([f'{k}=\"{v}\"' if isinstance(v, str) else f'{k}={v}' for k, v in output.input.items()])})"
-                )
-                result["last_computer_call_id"] = output.id
+                result.action = f"{output.name}({', '.join([f'{k}=\"{v}\"' if isinstance(v, str) else f'{k}={v}' for k, v in output.input.items()])})"
+                result.last_computer_call_id = output.id
             elif output.type == "text":
-                result["think"] += output.text
+                result.think += output.text
         return result
 
 
@@ -358,6 +383,8 @@ class OpenAIResponseModelArgs(BaseModelArgs):
     """Serializable object for instantiating a generic chat model with an OpenAI
     model."""
 
+    api = "openai"
+
     def make_model(self, extra_kwargs=None):
         return OpenAIResponseModel(
             model_name=self.model_name,
@@ -372,6 +399,8 @@ class ClaudeResponseModelArgs(BaseModelArgs):
     """Serializable object for instantiating a generic chat model with an OpenAI
     model."""
 
+    api = "anthropic"
+
     def make_model(self, extra_kwargs=None):
         return ClaudeResponseModel(
             model_name=self.model_name,