From 4c2fd939d3f9f8ff830fe44d9f9719ff26d17d39 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 29 Aug 2025 20:00:16 -0700
Subject: [PATCH 1/7] Add OpenAI 2048 example and fix agent loop

---
 examples/openai_2048.py           | 127 ++++++++++++++++++++++++++++++
 hud/agents/misc/response_agent.py |   2 +-
 hud/agents/openai_chat_generic.py |  38 ++++++---
 3 files changed, 156 insertions(+), 11 deletions(-)
 create mode 100644 examples/openai_2048.py

diff --git a/examples/openai_2048.py b/examples/openai_2048.py
new file mode 100644
index 000000000..6775a2e5d
--- /dev/null
+++ b/examples/openai_2048.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""
+OpenAI Chat Agent playing Text 2048
+
+This example demonstrates using the OpenAIChatAgent with the text-2048 environment.
+It shows how to:
+- Initialize an OpenAI client with the openai_chat agent
+- Configure the text-2048 environment
+- Run the agent to play the game
+
+Requirements:
+- pip install openai
+- export OPENAI_API_KEY="your-api-key"  # Or set OPENAI_BASE_URL for custom endpoints
+
+Environment Variables:
+- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional)
+- OPENAI_API_KEY: API key for authentication
+"""
+
+import asyncio
+import os
+from openai import AsyncOpenAI
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients import MCPClient
+from hud.datasets import Task
+
+from hud.agents.misc import ResponseAgent
+
+
+async def main():
+    # Initialize OpenAI client with environment variables
+    base_url = os.getenv("OPENAI_BASE_URL")  # Optional custom endpoint
+    api_key = os.getenv("OPENAI_API_KEY", "EMPTY")  # Default to "EMPTY" for local servers
+    
+    openai_client = AsyncOpenAI(
+        base_url=base_url,  # None will use default OpenAI endpoint
+        api_key=api_key,
+    )
+    
+    # Configure the text-2048 environment
+    mcp_config = {
+        "local": {
+            "command": "docker",
+            "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"]
+        }
+    }
+    
+    # Define the task with game setup and evaluation
+    task = Task(
+        prompt="""Play the 2048 game strategically. 
+        
+        Tips for high scores:
+        - Keep your highest tile in a corner (preferably bottom-right)
+        - Build tiles in descending order from that corner
+        - Avoid moving up unless absolutely necessary
+        - Try to keep tiles of similar values adjacent
+        
+        Use the 'move' tool with directions: up, down, left, or right.
+        Aim for the highest possible score!""",
+        mcp_config=mcp_config,
+        setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore
+        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore
+    )
+
+    # Initialize MCP client
+    client = MCPClient(mcp_config=task.mcp_config)
+    
+    # Create OpenAI agent with the text-2048 game tools
+    agent = GenericOpenAIChatAgent(
+        mcp_client=client,
+        openai_client=openai_client,
+        model_name="Qwen/Qwen2.5-3B-Instruct",
+        allowed_tools=["move"],
+        parallel_tool_calls=False,
+        response_agent=ResponseAgent(),
+        system_prompt="""You are an expert 2048 game player. 
+        Make strategic moves to achieve the highest score possible.
+        Always analyze the board state before making a move.""",
+    )
+
+    agent.metadata = {}
+
+    # Run the game with tracing
+    with hud.trace("OpenAI 2048 Game"):
+        try:
+            print("🎮 Starting 2048 game with OpenAI agent...")
+            print(f"🤖 Model: {agent.model_name}")
+            print("="*50)
+            
+            # Run the task with unlimited steps (game ends when no moves available)
+            result = await agent.run(task, max_steps=-1)
+            
+            # Display results
+            print("="*50)
+            print(f"✅ Game completed!")
+            print(f"🏆 Final Score/Max Tile: {result.reward}")
+            if result.info:
+                print(f"📊 Game Stats: {result.info}")
+
+            # Display conversation history
+            print("🗣️ Conversation History:")
+            for i, msg in enumerate(agent.conversation_history):
+                print(f"  {i+1} : {msg}")
+                print("-"*30)
+
+        except Exception as e:
+            print(f"❌ Error during game: {e}")
+        finally:
+            await client.shutdown()
+
+
+if __name__ == "__main__":
+    # Check for API configuration
+    if not os.getenv("OPENAI_API_KEY") and not os.getenv("OPENAI_BASE_URL"):
+        print("⚠️  Please configure OpenAI API access:")
+        print("   For OpenAI API: export OPENAI_API_KEY='your-api-key'")
+        print("   For local/custom endpoints: export OPENAI_BASE_URL='your-custom-endpoint'")
+        exit(1)
+    
+    # Display configuration
+    if os.getenv("OPENAI_BASE_URL"):
+        print(f"🔗 Using endpoint: {os.getenv('OPENAI_BASE_URL')}")
+    else:
+        print("🔗 Using default OpenAI API endpoint")
+    
+    asyncio.run(main())
\ No newline at end of file
diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py
index 4e7df2c2e..2b24b2113 100644
--- a/hud/agents/misc/response_agent.py
+++ b/hud/agents/misc/response_agent.py
@@ -54,7 +54,7 @@ async def determine_response(self, agent_message: str) -> ResponseType:
         """
         try:
             response = await self.client.chat.completions.create(
-                model="gpt-4o",
+                model="gpt-5-nano",
                 messages=[
                     {"role": "system", "content": self.system_prompt},
                     {
diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index 847dc158d..ca9a01a13 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -21,6 +21,7 @@
 
 import mcp.types as types
 
+from hud import instrument
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult
 
 from .base import MCPAgent
@@ -52,6 +53,7 @@ def __init__(
         self.model_name = model_name
         self.parallel_tool_calls = parallel_tool_calls
         self.logprobs = logprobs
+        self.conversation_history = []
 
     @staticmethod
     def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
@@ -64,9 +66,7 @@ def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
 
     async def get_system_messages(self) -> list[Any]:
         """Get system messages for OpenAI."""
-        return [
-            {"role": "system", "content": self.system_prompt},
-        ]
+        return [{"role": "system", "content": self.system_prompt}]
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""
@@ -96,8 +96,14 @@ def get_tool_schemas(self) -> list[dict]:
             openai_tools.append(openai_tool)
         return openai_tools
 
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
     async def get_response(self, messages: list[Any]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
+        
         # Convert MCP tool schemas to OpenAI format
         mcp_schemas = self.get_tool_schemas()
 
@@ -111,6 +117,19 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
 
         choice = response.choices[0]
         msg = choice.message
+        
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+        
+        if msg.content:
+            assistant_msg["content"] = msg.content
+        
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+        
+        messages.append(assistant_msg)
+
+        # Store the complete conversation history
+        self.conversation_history = messages.copy()
 
         tool_calls = []
         if msg.tool_calls:
@@ -144,11 +163,10 @@ async def format_tool_results(
                     for c in res.content
                     if hasattr(c, "text")
                 )
-            rendered.append(
-                {
-                    "role": "tool",
-                    "tool_call_id": call.id,
-                    "content": content or "",  # Ensure content is never None
-                }
-            )
+            tool_msg = {
+                "role": "tool",
+                "tool_call_id": call.id,
+                "content": content or "",  # Ensure content is never None
+            }
+            rendered.append(tool_msg)
         return rendered

From c2488b5fb8ea40b1fbb797be1ccd4e76fd7c95fe Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Fri, 29 Aug 2025 20:05:18 -0700
Subject: [PATCH 2/7] update completion handling to consider 'length' as a
 valid termination reason

---
 hud/agents/openai_chat_generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index ca9a01a13..a370ba00d 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -142,7 +142,7 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
         return AgentResponse(
             content=msg.content or "",
             tool_calls=tool_calls,
-            done=choice.finish_reason == "stop",
+            done=choice.finish_reason in ("stop", "length"),
             raw=response,  # Include raw response for access to Choice objects
         )
 

From 38b90fa2ce0acab43fb089c03d419a779b0a138e Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Wed, 3 Sep 2025 22:48:44 -0700
Subject: [PATCH 3/7] examples: add browser 2048 example, simplify text one

---
 examples/openai_2048.py         |  71 ++++++++---------
 examples/openai_browser_2048.py | 137 ++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+), 39 deletions(-)
 create mode 100644 examples/openai_browser_2048.py

diff --git a/examples/openai_2048.py b/examples/openai_2048.py
index 6775a2e5d..387b7dd5e 100644
--- a/examples/openai_2048.py
+++ b/examples/openai_2048.py
@@ -13,7 +13,7 @@
 - export OPENAI_API_KEY="your-api-key"  # Or set OPENAI_BASE_URL for custom endpoints
 
 Environment Variables:
-- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional)
+- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint
 - OPENAI_API_KEY: API key for authentication
 """
 
@@ -25,39 +25,47 @@
 from hud.clients import MCPClient
 from hud.datasets import Task
 
-from hud.agents.misc import ResponseAgent
-
 
 async def main():
     # Initialize OpenAI client with environment variables
-    base_url = os.getenv("OPENAI_BASE_URL")  # Optional custom endpoint
-    api_key = os.getenv("OPENAI_API_KEY", "EMPTY")  # Default to "EMPTY" for local servers
+    base_url = os.getenv("OPENAI_BASE_URL")
+    api_key = os.getenv("OPENAI_API_KEY")
     
     openai_client = AsyncOpenAI(
-        base_url=base_url,  # None will use default OpenAI endpoint
+        base_url=base_url if base_url else None,  # None will use default OpenAI endpoint
         api_key=api_key,
     )
     
-    # Configure the text-2048 environment
     mcp_config = {
         "local": {
             "command": "docker",
             "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"]
         }
     }
-    
+
+    system_prompt = """You are an expert 2048 game player. Your goal is to reach the tile specified by the user.
+
+HOW 2048 WORKS:
+- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
+- When you move, all tiles slide in that direction
+- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
+- After each move, a new tile (2 or 4) appears randomly
+- Game ends when grid is full and no merges possible
+
+CRITICAL RULES:
+- ALWAYS analyze the board before moving
+- ALWAYS make a tool call for your move
+- Use the 'move' tool with these choices: "up", "down", "left", or "right"
+- Remember: ALL strings in JSON must have quotes!
+- Make exactly ONE move per turn
+- NEVER ask for permission - just keep playing until the game ends
+- Don't ask "Should I continue?" - just make your next move
+
+Example tool call: {"name": "move", "arguments": {"direction": "right"}}"""
+
     # Define the task with game setup and evaluation
     task = Task(
-        prompt="""Play the 2048 game strategically. 
-        
-        Tips for high scores:
-        - Keep your highest tile in a corner (preferably bottom-right)
-        - Build tiles in descending order from that corner
-        - Avoid moving up unless absolutely necessary
-        - Try to keep tiles of similar values adjacent
-        
-        Use the 'move' tool with directions: up, down, left, or right.
-        Aim for the highest possible score!""",
+        prompt="""Aim for the 128 tile (atleast 800 points!)""",
         mcp_config=mcp_config,
         setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore
         evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore
@@ -65,30 +73,27 @@ async def main():
 
     # Initialize MCP client
     client = MCPClient(mcp_config=task.mcp_config)
-    
+
+    model_name = "gpt-5-mini" # Replace with your model name
+
     # Create OpenAI agent with the text-2048 game tools
     agent = GenericOpenAIChatAgent(
         mcp_client=client,
         openai_client=openai_client,
-        model_name="Qwen/Qwen2.5-3B-Instruct",
+        model_name=model_name,
         allowed_tools=["move"],
         parallel_tool_calls=False,
-        response_agent=ResponseAgent(),
-        system_prompt="""You are an expert 2048 game player. 
-        Make strategic moves to achieve the highest score possible.
-        Always analyze the board state before making a move.""",
+        system_prompt=system_prompt,
     )
 
     agent.metadata = {}
 
-    # Run the game with tracing
     with hud.trace("OpenAI 2048 Game"):
         try:
             print("🎮 Starting 2048 game with OpenAI agent...")
             print(f"🤖 Model: {agent.model_name}")
             print("="*50)
             
-            # Run the task with unlimited steps (game ends when no moves available)
             result = await agent.run(task, max_steps=-1)
             
             # Display results
@@ -111,17 +116,5 @@ async def main():
 
 
 if __name__ == "__main__":
-    # Check for API configuration
-    if not os.getenv("OPENAI_API_KEY") and not os.getenv("OPENAI_BASE_URL"):
-        print("⚠️  Please configure OpenAI API access:")
-        print("   For OpenAI API: export OPENAI_API_KEY='your-api-key'")
-        print("   For local/custom endpoints: export OPENAI_BASE_URL='your-custom-endpoint'")
-        exit(1)
-    
-    # Display configuration
-    if os.getenv("OPENAI_BASE_URL"):
-        print(f"🔗 Using endpoint: {os.getenv('OPENAI_BASE_URL')}")
-    else:
-        print("🔗 Using default OpenAI API endpoint")
-    
+
     asyncio.run(main())
\ No newline at end of file
diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py
new file mode 100644
index 000000000..ff41ac8e6
--- /dev/null
+++ b/examples/openai_browser_2048.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+OpenAI Chat Agent playing Browser 2048
+
+This example demonstrates using the OpenAIChatAgent with the browser-based 2048 game.
+It shows how to:
+- Initialize an OpenAI client with browser automation capabilities
+- Configure the browser-2048 environment with Docker
+- Use computer vision and interaction tools to play the game
+
+Requirements:
+- pip install openai
+- export OPENAI_API_KEY="your-api-key"  # Or set OPENAI_BASE_URL for custom endpoints
+- Docker installed and running
+
+Environment Variables:
+- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional)
+- OPENAI_API_KEY: API key for authentication
+"""
+
+import asyncio
+import os
+from openai import AsyncOpenAI
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients import MCPClient
+from hud.datasets import Task
+
+
+async def main():
+    # Initialize OpenAI client with environment variables
+    base_url = os.getenv("OPENAI_BASE_URL")
+    api_key = os.getenv("OPENAI_API_KEY")
+    
+    openai_client = AsyncOpenAI(
+        base_url=base_url if base_url else None,
+        api_key=api_key,
+    )
+    
+    # Configure the browser-2048 environment
+    mcp_config = {
+        "local": {
+            "command": "docker",
+            "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"]
+        }
+    }
+
+    system_prompt = """You are an expert 2048 game player using a browser interface. Your goal is to reach the tile specified by the user.
+
+HOW 2048 WORKS:
+- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
+- When you move, all tiles slide in that direction
+- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
+- After each move, a new tile (2 or 4) appears randomly
+- Game ends when grid is full and no merges possible
+
+BROWSER INTERACTION USING THE COMPUTER TOOL:
+1. TAKE SCREENSHOTS:
+   Use: computer(action="screenshot")
+   This captures the current game state
+
+2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press":
+   - Move UP: computer(action="press", keys=["up"])
+   - Move DOWN: computer(action="press", keys=["down"]) 
+   - Move LEFT: computer(action="press", keys=["left"])
+   - Move RIGHT: computer(action="press", keys=["right"])
+
+CRITICAL RULES:
+- Take a screenshot first to see the board state at the start of the game
+- Make exactly ONE move per turn using the press action with arrow keys
+- Continue playing until you reach the target or the game ends
+
+Strategy tips:
+- Keep your highest tiles in a corner
+- Build tiles in descending order from the corner
+- Avoid random moves - be strategic
+- Try to keep the board organized"""
+
+    # Define the task with browser game setup and evaluation
+    task = Task(
+        prompt="""Play the browser-based 2048 game and try to reach the 128 tile.
+        
+        Take screenshots to see the game board, then make strategic moves using the browser interface.
+        You can use arrow keys or mouse gestures to move tiles.""",
+        mcp_config=mcp_config,
+        setup_tool={
+            "name": "launch_app",
+            "arguments": {"app_name": "2048"}
+        },  # type: ignore
+        evaluate_tool={
+            "name": "evaluate",
+            "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
+        },  # type: ignore
+    )
+
+    # Initialize MCP client
+    client = MCPClient(mcp_config=task.mcp_config)
+
+    model_name = "z-ai/glm-4.5v"
+
+    # Create OpenAI agent with browser automation tools
+    agent = GenericOpenAIChatAgent(
+        mcp_client=client,
+        openai_client=openai_client,
+        model_name=model_name,
+        allowed_tools=["computer"],  # Computer tool for browser automation
+        parallel_tool_calls=False,
+        system_prompt=system_prompt,
+    )
+
+    agent.metadata = {}
+
+    # Run the game with tracing
+    with hud.trace("OpenAI Browser 2048 Game"):
+        try:
+            print("🎮 Starting browser-based 2048 game with OpenAI agent...")
+            print(f"🤖 Model: {agent.model_name}")
+            print(f"🌐 Browser environment running on localhost:8080")
+            print("="*50)
+            
+            result = await agent.run(task, max_steps=10)
+            
+            # Display results
+            print("="*50)
+            print(f"✅ Game completed!")
+            print(f"🏆 Final Score/Max Tile: {result.reward}")
+            if result.info:
+                print(f"📊 Game Stats: {result.info}")
+
+        except Exception as e:
+            print(f"❌ Error during game: {e}")
+        finally:
+            await client.shutdown()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file

From 2f974ef530b46bece3f82081348c2aa7b3a63336 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Wed, 3 Sep 2025 22:55:58 -0700
Subject: [PATCH 4/7] Apply ruff formatting; use gpt-5-mini in browser example

---
 examples/openai_2048.py           | 30 ++++++++++++++++--------------
 examples/openai_browser_2048.py   | 25 +++++++++++--------------
 hud/agents/openai_chat_generic.py | 10 +++++-----
 hud/cli/eval.py                   |  6 ++----
 hud/cli/init.py                   | 12 ++++++------
 hud/clients/fastmcp.py            |  9 +++++++--
 hud/clients/mcp_use.py            |  4 +++-
 hud/otel/instrumentation.py       |  3 ++-
 8 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/examples/openai_2048.py b/examples/openai_2048.py
index 387b7dd5e..000248b06 100644
--- a/examples/openai_2048.py
+++ b/examples/openai_2048.py
@@ -30,16 +30,16 @@ async def main():
     # Initialize OpenAI client with environment variables
     base_url = os.getenv("OPENAI_BASE_URL")
     api_key = os.getenv("OPENAI_API_KEY")
-    
+
     openai_client = AsyncOpenAI(
         base_url=base_url if base_url else None,  # None will use default OpenAI endpoint
         api_key=api_key,
     )
-    
+
     mcp_config = {
         "local": {
             "command": "docker",
-            "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"]
+            "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"],
         }
     }
 
@@ -67,14 +67,17 @@ async def main():
     task = Task(
         prompt="""Aim for the 128 tile (atleast 800 points!)""",
         mcp_config=mcp_config,
-        setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore
-        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore
+        setup_tool={
+            "name": "setup",
+            "arguments": {"name": "board", "arguments": {"board_size": 4}},
+        },  # type: ignore
+        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}},  # type: ignore
     )
 
     # Initialize MCP client
     client = MCPClient(mcp_config=task.mcp_config)
 
-    model_name = "gpt-5-mini" # Replace with your model name
+    model_name = "gpt-5-mini"  # Replace with your model name
 
     # Create OpenAI agent with the text-2048 game tools
     agent = GenericOpenAIChatAgent(
@@ -92,12 +95,12 @@ async def main():
         try:
             print("🎮 Starting 2048 game with OpenAI agent...")
             print(f"🤖 Model: {agent.model_name}")
-            print("="*50)
-            
+            print("=" * 50)
+
             result = await agent.run(task, max_steps=-1)
-            
+
             # Display results
-            print("="*50)
+            print("=" * 50)
             print(f"✅ Game completed!")
             print(f"🏆 Final Score/Max Tile: {result.reward}")
             if result.info:
@@ -106,8 +109,8 @@ async def main():
             # Display conversation history
             print("🗣️ Conversation History:")
             for i, msg in enumerate(agent.conversation_history):
-                print(f"  {i+1} : {msg}")
-                print("-"*30)
+                print(f"  {i + 1} : {msg}")
+                print("-" * 30)
 
         except Exception as e:
             print(f"❌ Error during game: {e}")
@@ -116,5 +119,4 @@ async def main():
 
 
 if __name__ == "__main__":
-
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())
diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py
index ff41ac8e6..c25a5f0e1 100644
--- a/examples/openai_browser_2048.py
+++ b/examples/openai_browser_2048.py
@@ -31,17 +31,17 @@ async def main():
     # Initialize OpenAI client with environment variables
     base_url = os.getenv("OPENAI_BASE_URL")
     api_key = os.getenv("OPENAI_API_KEY")
-    
+
     openai_client = AsyncOpenAI(
         base_url=base_url if base_url else None,
         api_key=api_key,
     )
-    
+
     # Configure the browser-2048 environment
     mcp_config = {
         "local": {
             "command": "docker",
-            "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"]
+            "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"],
         }
     }
 
@@ -83,10 +83,7 @@ async def main():
         Take screenshots to see the game board, then make strategic moves using the browser interface.
         You can use arrow keys or mouse gestures to move tiles.""",
         mcp_config=mcp_config,
-        setup_tool={
-            "name": "launch_app",
-            "arguments": {"app_name": "2048"}
-        },  # type: ignore
+        setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}},  # type: ignore
         evaluate_tool={
             "name": "evaluate",
             "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
@@ -96,14 +93,14 @@ async def main():
     # Initialize MCP client
     client = MCPClient(mcp_config=task.mcp_config)
 
-    model_name = "z-ai/glm-4.5v"
+    model_name = "gpt-5-mini"  # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc...
 
     # Create OpenAI agent with browser automation tools
     agent = GenericOpenAIChatAgent(
         mcp_client=client,
         openai_client=openai_client,
         model_name=model_name,
-        allowed_tools=["computer"],  # Computer tool for browser automation
+        allowed_tools=["computer"],
         parallel_tool_calls=False,
         system_prompt=system_prompt,
     )
@@ -116,12 +113,12 @@ async def main():
             print("🎮 Starting browser-based 2048 game with OpenAI agent...")
             print(f"🤖 Model: {agent.model_name}")
             print(f"🌐 Browser environment running on localhost:8080")
-            print("="*50)
-            
+            print("=" * 50)
+
             result = await agent.run(task, max_steps=10)
-            
+
             # Display results
-            print("="*50)
+            print("=" * 50)
             print(f"✅ Game completed!")
             print(f"🏆 Final Score/Max Tile: {result.reward}")
             if result.info:
@@ -134,4 +131,4 @@ async def main():
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())
diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index a370ba00d..3562d97e7 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -103,7 +103,7 @@ def get_tool_schemas(self) -> list[dict]:
     )
     async def get_response(self, messages: list[Any]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
-        
+
         # Convert MCP tool schemas to OpenAI format
         mcp_schemas = self.get_tool_schemas()
 
@@ -117,15 +117,15 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
 
         choice = response.choices[0]
         msg = choice.message
-        
+
         assistant_msg: dict[str, Any] = {"role": "assistant"}
-        
+
         if msg.content:
             assistant_msg["content"] = msg.content
-        
+
         if msg.tool_calls:
             assistant_msg["tool_calls"] = msg.tool_calls
-        
+
         messages.append(assistant_msg)
 
         # Store the complete conversation history
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 92626cbc7..c090cf4a8 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -111,8 +111,7 @@ async def run_single_task(
                     )
                     raise typer.Exit(1) from e
 
-                agent_config: dict[str, Any] = {
-                }
+                agent_config: dict[str, Any] = {}
                 if allowed_tools:
                     agent_config["allowed_tools"] = allowed_tools
 
@@ -240,8 +239,7 @@ async def run_full_dataset(
             )
             raise typer.Exit(1) from e
 
-        agent_config: dict[str, Any] = {
-        }
+        agent_config: dict[str, Any] = {}
         if allowed_tools:
             agent_config["allowed_tools"] = allowed_tools
 
diff --git a/hud/cli/init.py b/hud/cli/init.py
index c7404e088..a8de9464f 100644
--- a/hud/cli/init.py
+++ b/hud/cli/init.py
@@ -139,7 +139,7 @@ async def evaluate(target: int = 10) -> EvaluationResult:
     mcp.run()
 '''
 
-TASKS_JSON_TEMPLATE = '''[
+TASKS_JSON_TEMPLATE = """[
   {{
     "prompt": "Increment the counter to reach 10",
     "mcp_config": {{
@@ -159,7 +159,7 @@ async def evaluate(target: int = 10) -> EvaluationResult:
     }}
   }}
 ]
-'''
+"""
 
 TEST_TASK_TEMPLATE = '''#!/usr/bin/env python
 """Simple example of running tasks from tasks.json.
@@ -210,7 +210,7 @@ async def main():
     asyncio.run(main())
 '''
 
-NOTEBOOK_TEMPLATE = '''{{
+NOTEBOOK_TEMPLATE = """{{
  "cells": [
   {{
    "cell_type": "markdown",
@@ -427,9 +427,9 @@ async def main():
  "nbformat": 4,
  "nbformat_minor": 4
 }}
-'''
+"""
 
-README_TEMPLATE = '''# {title}
+README_TEMPLATE = """# {title}
 
 A minimal HUD environment demonstrating the Task pattern with a simple counter.
 
@@ -510,7 +510,7 @@ async def main():
 **Note**: Only public HuggingFace datasets appear as leaderboards!
 
 📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards)
-'''
+"""
 
 
 def sanitize_name(name: str) -> str:
diff --git a/hud/clients/fastmcp.py b/hud/clients/fastmcp.py
index 56b8bb223..c9f8b992d 100644
--- a/hud/clients/fastmcp.py
+++ b/hud/clients/fastmcp.py
@@ -106,8 +106,13 @@ async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
 
             # Configure validation for output schemas based on client setting
             try:
-                if hasattr(self._client, "_session_state") and self._client._session_state.session is not None:  # noqa: E501
-                    self._client._session_state.session._validate_structured_outputs = self._strict_validation  # noqa: E501
+                if (
+                    hasattr(self._client, "_session_state")
+                    and self._client._session_state.session is not None
+                ):  # noqa: E501
+                    self._client._session_state.session._validate_structured_outputs = (
+                        self._strict_validation
+                    )  # noqa: E501
             except ImportError:
                 pass
 
diff --git a/hud/clients/mcp_use.py b/hud/clients/mcp_use.py
index b81b714e2..2f9e94ce0 100644
--- a/hud/clients/mcp_use.py
+++ b/hud/clients/mcp_use.py
@@ -79,7 +79,9 @@ async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None:
                         and hasattr(session.connector, "client_session")
                         and session.connector.client_session is not None
                     ):
-                        session.connector.client_session._validate_structured_outputs = self._strict_validation  # noqa: E501
+                        session.connector.client_session._validate_structured_outputs = (
+                            self._strict_validation
+                        )  # noqa: E501
             except ImportError:
                 # ValidationOptions may not be available in some mcp versions
                 pass
diff --git a/hud/otel/instrumentation.py b/hud/otel/instrumentation.py
index c7c72b10d..ad30f5d37 100644
--- a/hud/otel/instrumentation.py
+++ b/hud/otel/instrumentation.py
@@ -32,8 +32,9 @@ def install_mcp_instrumentation(provider: TracerProvider) -> None:
     try:
         # First, patch the _instruments to use our fork
         import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst
+
         mcp_inst._instruments = ("hud-mcp-python-sdk >= 3.13.1",)
-        
+
         from opentelemetry.instrumentation.mcp.instrumentation import (
             McpInstrumentor,
         )

From 5e4d74534bd23cdce8c49e2c6e85ceae7688052c Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Thu, 4 Sep 2025 17:35:56 -0700
Subject: [PATCH 5/7] fix: image handling in OpenAI chat agent

---
 examples/openai_2048.py           |   6 +-
 examples/openai_browser_2048.py   |  23 +++--
 hud/agents/openai_chat_generic.py | 140 ++++++++++++++++++++++++------
 3 files changed, 132 insertions(+), 37 deletions(-)

diff --git a/examples/openai_2048.py b/examples/openai_2048.py
index 000248b06..37b1cc863 100644
--- a/examples/openai_2048.py
+++ b/examples/openai_2048.py
@@ -65,13 +65,13 @@ async def main():
 
     # Define the task with game setup and evaluation
     task = Task(
-        prompt="""Aim for the 128 tile (atleast 800 points!)""",
+        prompt="""Aim for the 128 tile (atleast a score of 800!)""",
         mcp_config=mcp_config,
         setup_tool={
             "name": "setup",
             "arguments": {"name": "board", "arguments": {"board_size": 4}},
         },  # type: ignore
-        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}},  # type: ignore
+        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 128}}},  # type: ignore
     )
 
     # Initialize MCP client
@@ -97,7 +97,7 @@ async def main():
             print(f"🤖 Model: {agent.model_name}")
             print("=" * 50)
 
-            result = await agent.run(task, max_steps=-1)
+            result = await agent.run(task, max_steps=100)
 
             # Display results
             print("=" * 50)
diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py
index c25a5f0e1..e9a3723c9 100644
--- a/examples/openai_browser_2048.py
+++ b/examples/openai_browser_2048.py
@@ -55,9 +55,10 @@ async def main():
 - Game ends when grid is full and no merges possible
 
 BROWSER INTERACTION USING THE COMPUTER TOOL:
-1. TAKE SCREENSHOTS:
+1. FIRST TURN ONLY - TAKE SCREENSHOT:
    Use: computer(action="screenshot")
-   This captures the current game state
+   This captures the initial game state. Only needed for your first turn.
+   After that, the environment will automatically return an image with each successful move.
 
 2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press":
    - Move UP: computer(action="press", keys=["up"])
@@ -66,9 +67,8 @@ async def main():
    - Move RIGHT: computer(action="press", keys=["right"])
 
 CRITICAL RULES:
-- Take a screenshot first to see the board state at the start of the game
 - Make exactly ONE move per turn using the press action with arrow keys
-- Continue playing until you reach the target or the game ends
+- Continue playing until you reach the target or the game ends, no need to ask the user for confirmation.
 
 Strategy tips:
 - Keep your highest tiles in a corner
@@ -79,9 +79,9 @@ async def main():
     # Define the task with browser game setup and evaluation
     task = Task(
         prompt="""Play the browser-based 2048 game and try to reach the 128 tile.
-        
-        Take screenshots to see the game board, then make strategic moves using the browser interface.
-        You can use arrow keys or mouse gestures to move tiles.""",
+
+        Start by taking a screenshot to see the initial game board, then make strategic moves using arrow keys.
+        After your first screenshot, the game board will be automatically shown after each successful move.""",
         mcp_config=mcp_config,
         setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}},  # type: ignore
         evaluate_tool={
@@ -93,7 +93,7 @@ async def main():
     # Initialize MCP client
     client = MCPClient(mcp_config=task.mcp_config)
 
-    model_name = "gpt-5-mini"  # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc...
+    model_name = "z-ai/glm-4.5v"  # alternatives: "gpt-5-mini", "Qwen/Qwen2.5-VL-7B-Instruct", etc.
 
     # Create OpenAI agent with browser automation tools
     agent = GenericOpenAIChatAgent(
@@ -115,7 +115,7 @@ async def main():
             print(f"🌐 Browser environment running on localhost:8080")
             print("=" * 50)
 
-            result = await agent.run(task, max_steps=10)
+            result = await agent.run(task, max_steps=100)
 
             # Display results
             print("=" * 50)
@@ -124,6 +124,11 @@ async def main():
             if result.info:
                 print(f"📊 Game Stats: {result.info}")
 
+            print("\n📝 Full interaction trace:")
+            for i, msg in enumerate(agent.conversation_history):
+                print(f"  {i + 1} : {msg}")
+                print("-" * 30)
+
         except Exception as e:
             print(f"❌ Error during game: {e}")
         finally:
diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index 3562d97e7..f4a1aec5e 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -70,27 +70,84 @@ async def get_system_messages(self) -> list[Any]:
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""
-        return [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": block.text}
-                    for block in blocks
-                    if isinstance(block, types.TextContent)
-                ],
-            },
-        ]
+        content = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                content.append({"type": "text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                content.append({
+                    "type": "image_url",
+                    "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"}
+                })
+        
+        return [{"role": "user", "content": content}]
+
+    def _sanitize_schema_for_openai(self, schema: dict) -> dict:
+        """Convert MCP JSON Schema to OpenAI-compatible format.
+        
+        Handles unsupported features like anyOf and prefixItems.
+        """
+        if not isinstance(schema, dict):
+            return schema
+            
+        sanitized = {}
+        
+        for key, value in schema.items():
+            if key == "anyOf" and isinstance(value, list):
+                # Handle anyOf patterns (usually for nullable fields)
+                non_null_types = [v for v in value if not (isinstance(v, dict) and v.get("type") == "null")]
+                if non_null_types:
+                    # Use the first non-null type
+                    sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
+                else:
+                    sanitized["type"] = "string"  # Fallback
+                    
+            elif key == "prefixItems":
+                # Convert prefixItems to simple items
+                sanitized["type"] = "array"
+                if isinstance(value, list) and value:
+                    # Use the type from the first item as the items schema
+                    first_item = value[0]
+                    if isinstance(first_item, dict):
+                        sanitized["items"] = {"type": first_item.get("type", "string")}
+                    else:
+                        sanitized["items"] = {"type": "string"}
+                        
+            elif key == "properties" and isinstance(value, dict):
+                # Recursively sanitize property schemas
+                sanitized[key] = {
+                    prop_name: self._sanitize_schema_for_openai(prop_schema)
+                    for prop_name, prop_schema in value.items()
+                }
+                
+            elif key == "items" and isinstance(value, dict):
+                # Recursively sanitize items schema
+                sanitized[key] = self._sanitize_schema_for_openai(value)
+                
+            elif key in ("type", "description", "enum", "required", "default",
+                        "minimum", "maximum", "minItems", "maxItems"):
+                # These are supported by OpenAI
+                sanitized[key] = value
+                
+        return sanitized or {"type": "object"}
 
     def get_tool_schemas(self) -> list[dict]:
         tool_schemas = super().get_tool_schemas()
         openai_tools = []
         for schema in tool_schemas:
+            parameters = schema.get("parameters", {})
+            
+            if parameters:
+                sanitized_params = self._sanitize_schema_for_openai(parameters)
+            else:
+                sanitized_params = {"type": "object", "properties": {}}
+                
             openai_tool = {
                 "type": "function",
                 "function": {
                     "name": schema["name"],
                     "description": schema.get("description", ""),
-                    "parameters": schema.get("parameters", {"type": "object", "properties": {}}),
+                    "parameters": sanitized_params,
                 },
             }
             openai_tools.append(openai_tool)
@@ -151,22 +208,55 @@ async def format_tool_results(
         tool_calls: list[MCPToolCall],
         tool_results: list[MCPToolResult],
     ) -> list[Any]:
-        """Render MCP tool results as OpenAI ``role=tool`` messages."""
+        """Render MCP tool results as OpenAI messages.
+        
+        Note: OpenAI tool messages only support string content.
+        When images are present, we return both a tool message and a user message.
+        """
         rendered: list[dict[str, Any]] = []
         for call, res in zip(tool_calls, tool_results, strict=False):
-            if res.structuredContent:
-                content = json.dumps(res.structuredContent)
-            else:
-                # Concatenate any TextContent blocks
-                content = "".join(
-                    c.text  # type: ignore[attr-defined]
-                    for c in res.content
-                    if hasattr(c, "text")
-                )
-            tool_msg = {
+            # Use structuredContent.result if available, otherwise use content
+            items = res.content
+            if res.structuredContent and isinstance(res.structuredContent, dict):
+                items = res.structuredContent.get("result", res.content)
+            
+            # Separate text and image content
+            text_parts = []
+            image_parts = []
+            
+            for item in items:
+                if isinstance(item, dict):
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                    elif item.get("type") == "image":
+                        image_parts.append({
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}"
+                            }
+                        })
+                elif isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_parts.append({
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"}
+                    })
+            
+            text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
+            rendered.append({
                 "role": "tool",
                 "tool_call_id": call.id,
-                "content": content or "",  # Ensure content is never None
-            }
-            rendered.append(tool_msg)
+                "content": text_content,
+            })
+            
+            # If there are images, add them as a separate user message
+            if image_parts:
+                # Add a user message with the images
+                content_with_images = [{"type": "text", "text": "Tool returned the following:"}] + image_parts
+                rendered.append({
+                    "role": "user",
+                    "content": content_with_images,
+                })
+            
         return rendered

From ceb4f8f1277071c84f933d8cea6057daa72aa481 Mon Sep 17 00:00:00 2001
From: Jaideep <jdchawla29@gmail.com>
Date: Thu, 4 Sep 2025 17:40:25 -0700
Subject: [PATCH 6/7] ruff

---
 examples/openai_2048.py           |   5 +-
 hud/agents/openai_chat_generic.py | 109 ++++++++++++++++++------------
 2 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/examples/openai_2048.py b/examples/openai_2048.py
index 37b1cc863..f83aab590 100644
--- a/examples/openai_2048.py
+++ b/examples/openai_2048.py
@@ -71,7 +71,10 @@ async def main():
             "name": "setup",
             "arguments": {"name": "board", "arguments": {"board_size": 4}},
         },  # type: ignore
-        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 128}}},  # type: ignore
+        evaluate_tool={
+            "name": "evaluate",
+            "arguments": {"name": "max_number", "arguments": {"target": 128}},
+        },  # type: ignore
     )
 
     # Initialize MCP client
diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index f4a1aec5e..8e61be940 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -75,33 +75,37 @@ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
             if isinstance(block, types.TextContent):
                 content.append({"type": "text", "text": block.text})
             elif isinstance(block, types.ImageContent):
-                content.append({
-                    "type": "image_url",
-                    "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"}
-                })
-        
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
+                    }
+                )
+
         return [{"role": "user", "content": content}]
 
     def _sanitize_schema_for_openai(self, schema: dict) -> dict:
         """Convert MCP JSON Schema to OpenAI-compatible format.
-        
+
         Handles unsupported features like anyOf and prefixItems.
         """
         if not isinstance(schema, dict):
             return schema
-            
+
         sanitized = {}
-        
+
         for key, value in schema.items():
             if key == "anyOf" and isinstance(value, list):
                 # Handle anyOf patterns (usually for nullable fields)
-                non_null_types = [v for v in value if not (isinstance(v, dict) and v.get("type") == "null")]
+                non_null_types = [
+                    v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
+                ]
                 if non_null_types:
                     # Use the first non-null type
                     sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
                 else:
                     sanitized["type"] = "string"  # Fallback
-                    
+
             elif key == "prefixItems":
                 # Convert prefixItems to simple items
                 sanitized["type"] = "array"
@@ -112,23 +116,32 @@ def _sanitize_schema_for_openai(self, schema: dict) -> dict:
                         sanitized["items"] = {"type": first_item.get("type", "string")}
                     else:
                         sanitized["items"] = {"type": "string"}
-                        
+
             elif key == "properties" and isinstance(value, dict):
                 # Recursively sanitize property schemas
                 sanitized[key] = {
                     prop_name: self._sanitize_schema_for_openai(prop_schema)
                     for prop_name, prop_schema in value.items()
                 }
-                
+
             elif key == "items" and isinstance(value, dict):
                 # Recursively sanitize items schema
                 sanitized[key] = self._sanitize_schema_for_openai(value)
-                
-            elif key in ("type", "description", "enum", "required", "default",
-                        "minimum", "maximum", "minItems", "maxItems"):
+
+            elif key in (
+                "type",
+                "description",
+                "enum",
+                "required",
+                "default",
+                "minimum",
+                "maximum",
+                "minItems",
+                "maxItems",
+            ):
                 # These are supported by OpenAI
                 sanitized[key] = value
-                
+
         return sanitized or {"type": "object"}
 
     def get_tool_schemas(self) -> list[dict]:
@@ -136,12 +149,12 @@ def get_tool_schemas(self) -> list[dict]:
         openai_tools = []
         for schema in tool_schemas:
             parameters = schema.get("parameters", {})
-            
+
             if parameters:
                 sanitized_params = self._sanitize_schema_for_openai(parameters)
             else:
                 sanitized_params = {"type": "object", "properties": {}}
-                
+
             openai_tool = {
                 "type": "function",
                 "function": {
@@ -209,7 +222,7 @@ async def format_tool_results(
         tool_results: list[MCPToolResult],
     ) -> list[Any]:
         """Render MCP tool results as OpenAI messages.
-        
+
         Note: OpenAI tool messages only support string content.
         When images are present, we return both a tool message and a user message.
         """
@@ -219,44 +232,54 @@ async def format_tool_results(
             items = res.content
             if res.structuredContent and isinstance(res.structuredContent, dict):
                 items = res.structuredContent.get("result", res.content)
-            
+
             # Separate text and image content
             text_parts = []
             image_parts = []
-            
+
             for item in items:
                 if isinstance(item, dict):
                     if item.get("type") == "text":
                         text_parts.append(item.get("text", ""))
                     elif item.get("type") == "image":
-                        image_parts.append({
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}"
+                        image_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}"
+                                },
                             }
-                        })
+                        )
                 elif isinstance(item, types.TextContent):
                     text_parts.append(item.text)
                 elif isinstance(item, types.ImageContent):
-                    image_parts.append({
-                        "type": "image_url",
-                        "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"}
-                    })
-            
+                    image_parts.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
+                        }
+                    )
+
             text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
-            rendered.append({
-                "role": "tool",
-                "tool_call_id": call.id,
-                "content": text_content,
-            })
-            
+            rendered.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": call.id,
+                    "content": text_content,
+                }
+            )
+
             # If there are images, add them as a separate user message
             if image_parts:
                 # Add a user message with the images
-                content_with_images = [{"type": "text", "text": "Tool returned the following:"}] + image_parts
-                rendered.append({
-                    "role": "user",
-                    "content": content_with_images,
-                })
-            
+                content_with_images = [
+                    {"type": "text", "text": "Tool returned the following:"}
+                ] + image_parts
+                rendered.append(
+                    {
+                        "role": "user",
+                        "content": content_with_images,
+                    }
+                )
+
         return rendered

From 585d0dbc359a37661a8a7118c344a8a199ca7b05 Mon Sep 17 00:00:00 2001
From: "Parth A. Patel" <parthpatel0220@gmail.com>
Date: Thu, 4 Sep 2025 17:55:22 -0700
Subject: [PATCH 7/7] nit: fix ruff checks

---
 hud/datasets/execution/parallel.py | 10 +++++-----
 hud/otel/exporters.py              |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hud/datasets/execution/parallel.py b/hud/datasets/execution/parallel.py
index dfa385219..45311b915 100644
--- a/hud/datasets/execution/parallel.py
+++ b/hud/datasets/execution/parallel.py
@@ -75,8 +75,8 @@ def _process_worker(
         pass
 
     # Set up signal handler for clean interruption
-    def signal_handler(signum, frame):
-        logger.warning(f"Worker {worker_id}: Received interrupt signal")
+    def signal_handler(signum: int, frame: Any) -> None:
+        logger.warning("Worker %s: Received interrupt signal", worker_id)
         # Raise KeyboardInterrupt to actually interrupt the worker
         raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
 
@@ -171,7 +171,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in
             results = await asyncio.gather(*tasks, return_exceptions=False)
             return results
         except asyncio.CancelledError:
-            logger.info(f"Worker {worker_id}: Tasks cancelled due to interruption")
+            logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
             # Return error results for all tasks
             return [
                 (
@@ -208,7 +208,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in
 
         return results
     except KeyboardInterrupt:
-        logger.info(f"Worker {worker_id}: Interrupted by user, stopping gracefully")
+        logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
         # Return partial results for tasks that completed
         partial_results = []
         for idx, _ in task_batch:
@@ -489,7 +489,7 @@ async def run_dataset_parallel_manual(
                             "content": "Task interrupted (Ctrl+C)",
                         }
 
-                logger.info(f"Interrupted after {completed}/{total} tasks")
+                logger.info("Interrupted after %s/%s tasks", completed, total)
                 raise  # Re-raise to propagate the interrupt
 
         finally:
diff --git a/hud/otel/exporters.py b/hud/otel/exporters.py
index 4d5dd9a1e..82e2ac954 100644
--- a/hud/otel/exporters.py
+++ b/hud/otel/exporters.py
@@ -14,9 +14,9 @@
 import contextlib
 import json
 import logging
+import time
 from collections import defaultdict
 from datetime import UTC, datetime
-import time
 from typing import TYPE_CHECKING, Any
 
 from mcp.types import ClientRequest, ServerResult