Merge remote-tracking branch 'origin/allac/next-agent' into aj/tool_use_agent_chat_completion_support

amanjaiswal73892 · amanjaiswal73892 · commit a16f0249b9e7 · 2025-06-11T17:52:17.000-04:00
diff --git a/src/agentlab/agents/agent_utils.py b/src/agentlab/agents/agent_utils.py
@@ -1,10 +1,7 @@
 from logging import warning
-from playwright.sync_api import Page 
 
 from PIL import Image, ImageDraw
-from logging import warning
-from playwright.sync_api import Page 
-
+from playwright.sync_api import Page
 
 """
 This module contains utility functions for handling observations and actions in the context of agent interactions.
@@ -87,6 +84,57 @@ def draw_mouse_pointer(image: Image.Image, x: int, y: int) -> Image.Image:
 
     return Image.alpha_composite(image.convert("RGBA"), overlay)
 
+
+def draw_click_indicator(image: Image.Image, x: int, y: int) -> Image.Image:
+    """
+    Draws a click indicator (+ shape with disconnected lines) at (x, y) on the image.
+    Returns a new image with the click indicator drawn.
+    """
+    line_length = 10  # Length of each line segment
+    gap = 4  # Gap from center point
+    line_width = 2  # Thickness of lines
+
+    overlay = image.convert("RGBA").copy()
+    draw = ImageDraw.Draw(overlay)
+
+    # Draw 4 lines forming a + shape with gaps in the center
+    # Each line has a white outline and black center for visibility on any background
+
+    # Top line
+    draw.line(
+        [(x, y - gap - line_length), (x, y - gap)], fill=(255, 255, 255, 200), width=line_width + 2
+    )  # White outline
+    draw.line(
+        [(x, y - gap - line_length), (x, y - gap)], fill=(0, 0, 0, 255), width=line_width
+    )  # Black center
+
+    # Bottom line
+    draw.line(
+        [(x, y + gap), (x, y + gap + line_length)], fill=(255, 255, 255, 200), width=line_width + 2
+    )  # White outline
+    draw.line(
+        [(x, y + gap), (x, y + gap + line_length)], fill=(0, 0, 0, 255), width=line_width
+    )  # Black center
+
+    # Left line
+    draw.line(
+        [(x - gap - line_length, y), (x - gap, y)], fill=(255, 255, 255, 200), width=line_width + 2
+    )  # White outline
+    draw.line(
+        [(x - gap - line_length, y), (x - gap, y)], fill=(0, 0, 0, 255), width=line_width
+    )  # Black center
+
+    # Right line
+    draw.line(
+        [(x + gap, y), (x + gap + line_length, y)], fill=(255, 255, 255, 200), width=line_width + 2
+    )  # White outline
+    draw.line(
+        [(x + gap, y), (x + gap + line_length, y)], fill=(0, 0, 0, 255), width=line_width
+    )  # Black center
+
+    return Image.alpha_composite(image.convert("RGBA"), overlay)
+
+
 def zoom_webpage(page: Page, zoom_factor: float = 1.5):
     """
     Zooms the webpage to the specified zoom factor.
diff --git a/src/agentlab/agents/tool_use_agent/multi_tool_agent.py b/src/agentlab/agents/tool_use_agent/multi_tool_agent.py
@@ -2,7 +2,7 @@
 import logging
 from abc import ABC, abstractmethod
 from copy import copy
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Any
 
@@ -54,6 +54,55 @@ def apply(self, llm, messages: list[MessageBuilder], **kwargs):
         pass
 
 
+@dataclass
+class MsgGroup:
+    name: str = None
+    messages: list[MessageBuilder] = field(default_factory=list)
+    summary: MessageBuilder = None
+
+
+class StructuredDiscussion:
+
+    def __init__(self, keep_last_n_obs=None):
+        self.groups: list[MsgGroup] = []
+        self.keep_last_n_obs = keep_last_n_obs
+
+    def append(self, message: MessageBuilder):
+        """Append a message to the last group."""
+        self.groups[-1].messages.append(message)
+
+    def new_group(self, name: str = None):
+        """Start a new group of messages."""
+        if name is None:
+            name = f"group_{len(self.groups)}"
+        self.groups.append(MsgGroup(name))
+
+    def flatten(self) -> list[MessageBuilder]:
+        """Flatten the groups into a single list of messages."""
+
+        keep_last_n_obs = self.keep_last_n_obs or len(self.groups)
+        messages = []
+        for i, group in enumerate(self.groups):
+            is_tail = i >= len(self.groups) - keep_last_n_obs
+            print(
+                f"Processing group {i} ({group.name}), is_tail={is_tail}, len(greoup)={len(group.messages)}"
+            )
+            if not is_tail and group.summary is not None:
+                messages.append(group.summary)
+            else:
+                messages.extend(group.messages)
+
+        return messages
+
+    def set_last_summary(self, summary: MessageBuilder):
+        # append None to summaries until we reach the current group index
+        self.groups[-1].summary = summary
+
+    def is_goal_set(self) -> bool:
+        """Check if the goal is set in the first group."""
+        return len(self.groups) > 0
+
+
 # @dataclass
 # class BlockArgs(ABC):
 
@@ -74,9 +123,9 @@ class Goal(Block):
 
     goal_as_system_msg: bool = True
 
-    def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict:
+    def apply(self, llm, discussion: StructuredDiscussion, obs: dict) -> dict:
         system_message = llm.msg.system().add_text(SYS_MSG)
-        messages.append(system_message)
+        discussion.append(system_message)
 
         if self.goal_as_system_msg:
             goal_message = llm.msg.system()
@@ -89,7 +138,7 @@ def apply(self, llm, messages: list[MessageBuilder], obs: dict) -> dict:
                 goal_message.add_text(content["text"])
             elif content["type"] == "image_url":
                 goal_message.add_image(content["image_url"])
-        messages.append(goal_message)
+        discussion.append(goal_message)
 
 
 AXTREE_NOTE = """
@@ -108,11 +157,11 @@ class Obs(Block):
     use_dom: bool = False
     use_som: bool = False
     use_tabs: bool = False
-    add_mouse_pointer: bool = True
+    add_mouse_pointer: bool = False
     use_zoomed_webpage: bool = False
 
     def apply(
-        self, llm, messages: list[MessageBuilder], obs: dict, last_llm_output: LLMOutput
+        self, llm, discussion: StructuredDiscussion, obs: dict, last_llm_output: LLMOutput
     ) -> dict:
 
         if last_llm_output.tool_calls is None:
@@ -147,7 +196,7 @@ def apply(
         if self.use_tabs:
             obs_msg.add_text(_format_tabs(obs))
 
-        messages.append(obs_msg)
+        discussion.append(obs_msg)
         return obs_msg
 
 
@@ -172,7 +221,7 @@ class GeneralHints(Block):
 
     use_hints: bool = True
 
-    def apply(self, llm, messages: list[MessageBuilder]) -> dict:
+    def apply(self, llm, discussion: StructuredDiscussion) -> dict:
         if not self.use_hints:
             return
 
@@ -182,7 +231,7 @@ def apply(self, llm, messages: list[MessageBuilder]) -> dict:
             """Use ControlOrMeta instead of Control and Meta for keyboard shortcuts, to be cross-platform compatible. E.g. use ControlOrMeta for mutliple selection in lists.\n"""
         )
 
-        messages.append(llm.msg.user().add_text("\n".join(hints)))
+        discussion.append(llm.msg.user().add_text("\n".join(hints)))
 
 
 @dataclass
@@ -192,20 +241,22 @@ class Summarizer(Block):
     do_summary: bool = False
     high_details: bool = True
 
-    def apply(self, llm, messages: list[MessageBuilder]) -> dict:
+    def apply(self, llm, discussion: StructuredDiscussion) -> dict:
         if not self.do_summary:
             return
 
         msg = llm.msg.user().add_text("""Summarize\n""")
 
-        messages.append(msg)
+        discussion.append(msg)
         # TODO need to make sure we don't force tool use here
-        summary_response = llm(messages=messages, tool_choice="none")
+        summary_response = llm(messages=discussion.flatten(), tool_choice="none")
 
         summary_msg = llm.msg.assistant().add_text(summary_response.think)
-        messages.append(summary_msg)
+        discussion.append(summary_msg)
+        discussion.set_last_summary(summary_msg)
+        return summary_msg
 
-    def apply_init(self, llm, messages: list[MessageBuilder]) -> dict:
+    def apply_init(self, llm, discussion: StructuredDiscussion) -> dict:
         """Initialize the summarizer block."""
         if not self.do_summary:
             return
@@ -215,19 +266,18 @@ def apply_init(self, llm, messages: list[MessageBuilder]) -> dict:
             # Add a system message to the LLM to indicate that it should summarize
             system_msg.add_text(
                 """# Summarizer instructions:\nWhen asked to summarize, do the following:
-    1) Summarize the effect of the last action, with attention to details.
-    2) Give a semantic description of the current state of the environment, with attention to details. If there was a repeating mistake, mention the cause of it.
-    3) Reason about the overall task at a high level.
-    4) What hint can be relevant for the next action? Only chose from the hints provided in the task description. Or select none.
-    5) What is the currently activated item if any.
-    6) Reason about the next action to take, based on the current state and the goal.
-    """
+1) Summarize the effect of the last action, with attention to details.
+2) Give a semantic description of the current state of the environment, with attention to details. If there was a repeating mistake, mention the cause of it.
+3) Reason about the overall task at a high level.
+4) What hint can be relevant for the next action? Only chose from the hints provided in the task description. Or select none.
+5) Reason about the next action to take, based on the current state and the goal.
+"""
             )
         else:
             system_msg.add_text(
                 """When asked to summarize, give a semantic description of the current state of the environment."""
             )
-        messages.append(system_msg)
+        discussion.append(system_msg)
 
 
 @dataclass
@@ -243,7 +293,7 @@ def _init(self):
         # index the task_name for fast lookup
         # self.hint_db.set_index("task_name", inplace=True, drop=False)
 
-    def apply(self, llm, messages: list[MessageBuilder], task_name: str) -> dict:
+    def apply(self, llm, discussion: StructuredDiscussion, task_name: str) -> dict:
         if not self.use_task_hint:
             return
 
@@ -257,12 +307,14 @@ def apply(self, llm, messages: list[MessageBuilder], task_name: str) -> dict:
             if hint:
                 hints.append(f"- {hint}")
 
-        hints_str = "# Hints:\nHere are some hints for the task you are working on:\n" + "\n".join(
-            hints
-        )
-        msg = llm.msg.user().add_text(hints_str)
+        if len(hints) > 0:
+            hints_str = (
+                "# Hints:\nHere are some hints for the task you are working on:\n"
+                + "\n".join(hints)
+            )
+            msg = llm.msg.user().add_text(hints_str)
 
-        messages.append(msg)
+            discussion.append(msg)
 
 
 class ToolCall(Block):
@@ -292,6 +344,7 @@ class PromptConfig:
     summarizer: Summarizer = None
     general_hints: GeneralHints = None
     task_hint: TaskHint = None
+    keep_last_n_obs: int = 2
 
 
 @dataclass
@@ -339,8 +392,9 @@ def __init__(
         self.llm.msg = self.msg_builder
 
         self.task_hint = self.config.task_hint.make()
+        self.obs_block = self.config.obs.make()
 
-        self.messages: list[MessageBuilder] = []
+        self.discussion = StructuredDiscussion(self.config.keep_last_n_obs)
         self.last_response: LLMOutput = LLMOutput()
         self._responses: list[LLMOutput] = []
 
@@ -380,37 +434,41 @@ def set_task_name(self, task_name: str):
     @cost_tracker_decorator
     def get_action(self, obs: Any) -> float:
         self.llm.reset_stats()
-        if len(self.messages) == 0:
-            self.config.goal.apply(self.llm, self.messages, obs)
-            self.config.summarizer.apply_init(self.llm, self.messages)
-            self.config.general_hints.apply(self.llm, self.messages)
-            self.task_hint.apply(self.llm, self.messages, self.task_name)
-
-        logging.info("Appending observation to messages")
-        self.config.obs.apply(self.llm, self.messages, obs, last_llm_output=self.last_response)
-        logging.info("Calling summarizer")
-        self.config.summarizer.apply(self.llm, self.messages)
-        logging.info("Main tool calling")
+        if not self.discussion.is_goal_set():
+            self.discussion.new_group("goal")
+            self.config.goal.apply(self.llm, self.discussion, obs)
+            self.config.summarizer.apply_init(self.llm, self.discussion)
+            self.config.general_hints.apply(self.llm, self.discussion)
+            self.task_hint.apply(self.llm, self.discussion, self.task_name)
+
+            self.discussion.new_group()
+
+        self.obs_block.apply(self.llm, self.discussion, obs, last_llm_output=self.last_response)
+        print("flatten for summary")
+
+        self.config.summarizer.apply(self.llm, self.discussion)
+
+        messages = self.discussion.flatten()
         response: LLMOutput = self.llm(
-            messages=self.messages,
+            messages=messages,
             tool_choice="any",
             cache_tool_definition=True,
             cache_complete_prompt=True,
         )
-        logging.info(f"Obtained response {response}")
 
         action = response.action
         think = response.think
 
-        self.messages.append(response.tool_calls)
+        self.discussion.new_group()
+        self.discussion.append(response.tool_calls)
 
         self.last_response = response
         self._responses.append(response)  # may be useful for debugging
         # self.messages.append(response.assistant_message)  # this is tool call
 
         agent_info = bgym.AgentInfo(
             think=think,
-            chat_messages=self.messages,
+            chat_messages=messages,
             stats=self.llm.stats.stats_dict,
         )
         return action, agent_info