11import json
22import logging
33import pprint
4+ import time
45from dataclasses import dataclass
56from functools import partial
6- from typing import Callable
7+ from typing import Callable , Literal
78
8- from litellm import completion_with_retries
9+ from litellm import completion
910from litellm .types .utils import ChatCompletionMessageToolCall , Message , ModelResponse
1011from PIL import Image
1112from termcolor import colored
1718
1819logger = logging .getLogger (__name__ )
1920
class LLMArgs(BaseModelArgs):
    """Configuration for the LiteLLM-backed chat model used by the agent.

    Extends BaseModelArgs with reasoning/tool-calling knobs and builds a
    ready-to-call completion function via make_model().
    """

    # Reasoning-effort hint forwarded to reasoning-capable models.
    reasoning_effort: Literal["minimal", "low", "medium", "high"] = "low"
    # Number of retries LiteLLM performs internally on transient API errors.
    num_retries: int = 3

    def make_model(self) -> Callable:
        """Return litellm.completion pre-bound with this configuration.

        The returned callable only needs `messages=` (and optionally `tools=`)
        at call time; every other parameter is fixed here.
        """
        # NOTE(review): both max_tokens and max_completion_tokens are sent;
        # some providers reject the pair — confirm the target API accepts both.
        return partial(
            completion,
            model=self.model_name,
            temperature=self.temperature,
            max_tokens=self.max_total_tokens,
            max_completion_tokens=self.max_new_tokens,
            reasoning_effort=self.reasoning_effort,
            num_retries=self.num_retries,
            # Let the model decide when to call a tool, one call at a time.
            tool_choice="auto",
            parallel_tool_calls=False,
        )
9338
94- Step = LLMOutput | Observation | SystemMessage | UserMessage
9539
9640@dataclass
9741class AgentConfig :
@@ -112,68 +56,90 @@ class AgentConfig:
112562. Evaluate action success, explain impact on task and next steps.
113573. If you see any errors in the last observation, think about it. If there is no error, just move on.
114584. List next steps to move towards the goal and propose next immediate action.
115- Then produce the function call that performs the proposed action. If the task is complete, produce the final step.
59+ Then produce the single function call that performs the proposed action. If the task is complete, produce the final step.
11660"""
11761
118- class LLMArgs (BaseModelArgs ):
119- reasoning_effort : str = "low"
120-
121- def make_model (self ) -> Callable :
122- return partial (
123- completion_with_retries ,
124- model = self .model_name ,
125- temperature = self .temperature ,
126- max_tokens = self .max_total_tokens ,
127- max_completion_tokens = self .max_new_tokens ,
128- reasoning_effort = self .reasoning_effort ,
129- )
13062
13163class ReactToolCallAgent :
132- def __init__ (self , action_set : ToolsActionSet , llm : Callable , config : AgentConfig ):
64+ def __init__ (
65+ self , action_set : ToolsActionSet , llm : Callable [..., ModelResponse ], config : AgentConfig
66+ ):
13367 self .action_set = action_set
134- self .history : list [Step ] = [SystemMessage ( message = config .system_prompt ) ]
68+ self .history : list [dict | Message ] = [{ "role" : "system" , "content" : config .system_prompt } ]
13569 self .llm = llm
13670 self .config = config
13771 self .last_tool_call_id : str = ""
13872
13973 def obs_preprocessor (self , obs : dict ) -> dict :
140- if not self .config .use_html :
141- obs .pop ("pruned_html" , None )
142- if not self .config .use_axtree :
143- obs .pop ("axtree_txt" , None )
144- if not self .config .use_screenshot :
145- obs .pop ("screenshot" , None )
146- if self .last_tool_call_id :
147- # add tool_call_id to obs for linking observation to the last executed action
148- obs ["tool_call_id" ] = self .last_tool_call_id
14974 return obs
15075
76+ def obs_to_messages (self , obs : dict ) -> list [dict ]:
77+ """
78+ Convert the observation dictionary into a list of chat messages for Lite LLM
79+ """
80+ messages = []
81+ if obs .get ("goal_object" ) and not self .last_tool_call_id :
82+ # its a first observation when there are no tool_call_id, so include goal
83+ goal = obs ["goal_object" ][0 ]["text" ]
84+ messages .append ({"role" : "user" , "content" : f"## Goal:\n { goal } " })
85+ text_obs = []
86+ if result := obs .get ("action_result" ):
87+ text_obs .append (f"## Action Result:\n { result } " )
88+ if error := obs .get ("last_action_error" ):
89+ text_obs .append (f"## Action Error:\n { error } " )
90+ if self .config .use_html and (html := obs .get ("pruned_html" )):
91+ text_obs .append (f"## HTML:\n { html } " )
92+ if self .config .use_axtree and (axtree := obs .get ("axtree_txt" )):
93+ text_obs .append (f"## Accessibility Tree:\n { axtree } " )
94+ content = "\n \n " .join (text_obs )
95+ if content :
96+ if self .last_tool_call_id :
97+ message = {
98+ "role" : "tool" ,
99+ "tool_call_id" : self .last_tool_call_id ,
100+ "content" : content ,
101+ }
102+ else :
103+ message = {"role" : "user" , "content" : content }
104+ messages .append (message )
105+ if self .config .use_screenshot and (scr := obs .get ("screenshot" )):
106+ if isinstance (scr , Image .Image ):
107+ image_content = [
108+ {"type" : "image_url" , "image_url" : {"url" : image_to_png_base64_url (scr )}}
109+ ]
110+ messages .append ({"role" : "user" , "content" : image_content })
111+ else :
112+ raise ValueError (
113+ f"Expected Image.Image in screenshot obs, got { type (obs ['screenshot' ])} "
114+ )
115+ return messages
116+
151117 def get_action (self , obs : dict ) -> tuple [ToolCallAction , dict ]:
152- prev_actions = [step for step in self .history if isinstance (step , LLMOutput )]
153- if len (prev_actions ) >= self .config .max_actions :
118+ actions_count = len (
119+ [msg for msg in self .history if isinstance (msg , Message ) and msg .tool_calls ]
120+ )
121+ if actions_count >= self .config .max_actions :
154122 logger .warning ("Max actions reached, stopping agent." )
155- stop_action = ToolCallAction (id = "stop" , function = FunctionCall (name = "final_step" , arguments = {}))
123+ stop_action = ToolCallAction (
124+ id = "stop" , function = FunctionCall (name = "final_step" , arguments = {})
125+ )
156126 return stop_action , {}
157- self .history .append (Observation (data = obs ))
158- steps = self .history + [UserMessage (message = self .config .guidance )]
159- messages = [m for step in steps for m in step .to_messages ()]
127+ self .history += self .obs_to_messages (self .obs_preprocessor (obs ))
160128 tools = [tool .model_dump () for tool in self .action_set .actions ]
129+ messages = self .history + [{"role" : "user" , "content" : self .config .guidance }]
130+
161131 try :
162132 logger .info (colored (f"Prompt:\n { pprint .pformat (messages , width = 120 )} " , "blue" ))
163- response : ModelResponse = self .llm (
164- tools = tools ,
165- messages = messages ,
166- num_retries = self .config .max_retry ,
167- )
168- message = response .choices [0 ].message # type: ignore
133+ response = self .llm (tools = tools , messages = messages )
134+ message = response .choices [0 ].message # type: ignore
169135 except Exception as e :
170136 logger .exception (f"Error getting LLM response: { e } . Prompt: { messages } " )
171137 raise e
172138 logger .info (colored (f"LLM response:\n { pprint .pformat (message , width = 120 )} " , "green" ))
173- self .history .append (LLMOutput (message = message ))
139+
140+ self .history .append (message )
174141 thoughts = self .thoughts_from_message (message )
175142 action = self .action_from_message (message )
176-
177143 return action , {"think" : thoughts }
178144
179145 def thoughts_from_message (self , message ) -> str :
@@ -187,7 +153,7 @@ def thoughts_from_message(self, message) -> str:
187153 logger .info (colored (f"LLM thinking block:\n { thinking } " , "yellow" ))
188154 thoughts .append (thinking )
189155 if message .content :
190- logger .info (colored (f"LLM output:\n { message .content } " , "cyan" ))
156+ logger .info (colored (f"LLM text output:\n { message .content } " , "cyan" ))
191157 thoughts .append (message .content )
192158 return "\n \n " .join (thoughts )
193159
@@ -199,27 +165,27 @@ def action_from_message(self, message) -> ToolCallAction:
199165 assert isinstance (tool_call .function .name , str )
200166 try :
201167 args = json .loads (tool_call .function .arguments )
202- action = ToolCallAction (
203- id = tool_call .id ,
204- function = FunctionCall (name = tool_call .function .name , arguments = args )
205- )
206168 except json .JSONDecodeError as e :
207- logger .exception (f"Error in json parsing of tool call arguments, { e } : { tool_call .function .arguments } " )
169+ logger .exception (
170+ f"Error in json parsing of tool call arguments, { e } : { tool_call .function .arguments } "
171+ )
208172 raise e
209-
173+ action = ToolCallAction (
174+ id = tool_call .id , function = FunctionCall (name = tool_call .function .name , arguments = args )
175+ )
210176 self .last_tool_call_id = action .id
177+ logger .info (f"Parsed tool call action: { action } " )
211178 else :
212179 raise ValueError (f"No tool call found in LLM response: { message } " )
213180 return action
214-
181+
215182
@dataclass
class ReactToolCallAgentArgs(AgentArgs):
    """Factory args bundling the model config and agent config."""

    # Both fields must be populated before make_agent() is called.
    llm_args: LLMArgs | None = None
    config: AgentConfig | None = None

    def make_agent(self, actions: list[ToolSpec]) -> ReactToolCallAgent:
        """Build a ReactToolCallAgent over the given tool specs.

        Raises:
            ValueError: if llm_args or config were left unset (clearer than
                the AttributeError a None dereference would produce).
        """
        if self.llm_args is None or self.config is None:
            raise ValueError("ReactToolCallAgentArgs requires both llm_args and config to be set.")
        llm = self.llm_args.make_model()
        action_set = ToolsActionSet(actions=actions)
        return ReactToolCallAgent(action_set=action_set, llm=llm, config=self.config)