Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions examples/openai_2048.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""
OpenAI Chat Agent playing Text 2048

This example demonstrates using the OpenAIChatAgent with the text-2048 environment.
It shows how to:
- Initialize an OpenAI client with the openai_chat agent
- Configure the text-2048 environment
- Run the agent to play the game

Requirements:
- pip install openai
- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints

Environment Variables:
- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint
- OPENAI_API_KEY: API key for authentication
"""

import asyncio
import os
from openai import AsyncOpenAI
import hud
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
from hud.clients import MCPClient
from hud.datasets import Task


async def main():
    """Run an OpenAI chat agent against the text-2048 MCP environment.

    Reads OPENAI_BASE_URL / OPENAI_API_KEY from the environment, launches the
    text-2048 game in a local Docker container, and lets the agent play until
    the game ends, printing the result and conversation history.
    """
    # Initialize OpenAI client with environment variables.
    base_url = os.getenv("OPENAI_BASE_URL")
    api_key = os.getenv("OPENAI_API_KEY")

    openai_client = AsyncOpenAI(
        base_url=base_url if base_url else None,  # None falls back to the default OpenAI endpoint
        api_key=api_key,
    )

    # Run the text-2048 environment as a local Docker container speaking MCP over stdio.
    mcp_config = {
        "local": {
            "command": "docker",
            "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"],
        }
    }

    system_prompt = """You are an expert 2048 game player. Your goal is to reach the tile specified by the user.

HOW 2048 WORKS:
- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
- When you move, all tiles slide in that direction
- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
- After each move, a new tile (2 or 4) appears randomly
- Game ends when grid is full and no merges possible

CRITICAL RULES:
- ALWAYS analyze the board before moving
- ALWAYS make a tool call for your move
- Use the 'move' tool with these choices: "up", "down", "left", or "right"
- Remember: ALL strings in JSON must have quotes!
- Make exactly ONE move per turn
- NEVER ask for permission - just keep playing until the game ends
- Don't ask "Should I continue?" - just make your next move

Example tool call: {"name": "move", "arguments": {"direction": "right"}}"""

    # Define the task with game setup and evaluation.
    task = Task(
        prompt="""Aim for the 128 tile (at least 800 points!)""",
        mcp_config=mcp_config,
        setup_tool={
            "name": "setup",
            "arguments": {"name": "board", "arguments": {"board_size": 4}},
        },  # type: ignore
        evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}},  # type: ignore
    )

    # Initialize MCP client from the task's config.
    client = MCPClient(mcp_config=task.mcp_config)

    model_name = "gpt-5-mini"  # Replace with your model name

    # Create OpenAI agent restricted to the text-2048 'move' tool.
    agent = GenericOpenAIChatAgent(
        mcp_client=client,
        openai_client=openai_client,
        model_name=model_name,
        allowed_tools=["move"],
        parallel_tool_calls=False,
        system_prompt=system_prompt,
    )

    agent.metadata = {}

    with hud.trace("OpenAI 2048 Game"):
        try:
            print("🎮 Starting 2048 game with OpenAI agent...")
            print(f"🤖 Model: {agent.model_name}")
            print("=" * 50)

            # max_steps=-1: presumably no step cap — play until the game ends.
            # TODO(review): confirm -1 means "unlimited" in agent.run().
            result = await agent.run(task, max_steps=-1)

            # Display results.
            print("=" * 50)
            print("✅ Game completed!")
            print(f"🏆 Final Score/Max Tile: {result.reward}")
            if result.info:
                print(f"📊 Game Stats: {result.info}")

            # Display conversation history.
            print("🗣️ Conversation History:")
            for i, msg in enumerate(agent.conversation_history):
                print(f"  {i + 1} : {msg}")
                print("-" * 30)

        except Exception as e:
            print(f"❌ Error during game: {e}")
        finally:
            # Always tear down the MCP client (and its Docker container).
            await client.shutdown()


if __name__ == "__main__":
    # Script entry point: drive the async main() to completion.
    asyncio.run(main())
134 changes: 134 additions & 0 deletions examples/openai_browser_2048.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
OpenAI Chat Agent playing Browser 2048

This example demonstrates using the OpenAIChatAgent with the browser-based 2048 game.
It shows how to:
- Initialize an OpenAI client with browser automation capabilities
- Configure the browser-2048 environment with Docker
- Use computer vision and interaction tools to play the game

Requirements:
- pip install openai
- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints
- Docker installed and running

Environment Variables:
- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional)
- OPENAI_API_KEY: API key for authentication
"""

import asyncio
import os
from openai import AsyncOpenAI
import hud
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
from hud.clients import MCPClient
from hud.datasets import Task


async def main():
    """Run an OpenAI chat agent against the browser-based 2048 environment.

    Reads OPENAI_BASE_URL / OPENAI_API_KEY from the environment, launches the
    hud-browser Docker image on port 8080, and lets the agent play 2048 via
    screenshots and arrow-key presses through the 'computer' tool.
    """
    # Initialize OpenAI client with environment variables.
    base_url = os.getenv("OPENAI_BASE_URL")
    api_key = os.getenv("OPENAI_API_KEY")

    openai_client = AsyncOpenAI(
        base_url=base_url if base_url else None,  # None falls back to the default OpenAI endpoint
        api_key=api_key,
    )

    # Configure the browser-2048 environment: Docker container speaking MCP
    # over stdio, with the browser UI exposed on localhost:8080.
    mcp_config = {
        "local": {
            "command": "docker",
            "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"],
        }
    }

    system_prompt = """You are an expert 2048 game player using a browser interface. Your goal is to reach the tile specified by the user.

HOW 2048 WORKS:
- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
- When you move, all tiles slide in that direction
- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
- After each move, a new tile (2 or 4) appears randomly
- Game ends when grid is full and no merges possible

BROWSER INTERACTION USING THE COMPUTER TOOL:
1. TAKE SCREENSHOTS:
   Use: computer(action="screenshot")
   This captures the current game state

2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press":
   - Move UP: computer(action="press", keys=["up"])
   - Move DOWN: computer(action="press", keys=["down"])
   - Move LEFT: computer(action="press", keys=["left"])
   - Move RIGHT: computer(action="press", keys=["right"])

CRITICAL RULES:
- Take a screenshot first to see the board state at the start of the game
- Make exactly ONE move per turn using the press action with arrow keys
- Continue playing until you reach the target or the game ends

Strategy tips:
- Keep your highest tiles in a corner
- Build tiles in descending order from the corner
- Avoid random moves - be strategic
- Try to keep the board organized"""

    # Define the task with browser game setup and evaluation.
    task = Task(
        prompt="""Play the browser-based 2048 game and try to reach the 128 tile.

Take screenshots to see the game board, then make strategic moves using the browser interface.
You can use arrow keys or mouse gestures to move tiles.""",
        mcp_config=mcp_config,
        setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}},  # type: ignore
        evaluate_tool={
            "name": "evaluate",
            "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
        },  # type: ignore
    )

    # Initialize MCP client from the task's config.
    client = MCPClient(mcp_config=task.mcp_config)

    model_name = "gpt-5-mini"  # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc...

    # Create OpenAI agent restricted to the browser-automation 'computer' tool.
    agent = GenericOpenAIChatAgent(
        mcp_client=client,
        openai_client=openai_client,
        model_name=model_name,
        allowed_tools=["computer"],
        parallel_tool_calls=False,
        system_prompt=system_prompt,
    )

    agent.metadata = {}

    # Run the game with tracing.
    with hud.trace("OpenAI Browser 2048 Game"):
        try:
            print("🎮 Starting browser-based 2048 game with OpenAI agent...")
            print(f"🤖 Model: {agent.model_name}")
            print("🌐 Browser environment running on localhost:8080")
            print("=" * 50)

            result = await agent.run(task, max_steps=10)

            # Display results.
            print("=" * 50)
            print("✅ Game completed!")
            print(f"🏆 Final Score/Max Tile: {result.reward}")
            if result.info:
                print(f"📊 Game Stats: {result.info}")

        except Exception as e:
            print(f"❌ Error during game: {e}")
        finally:
            # Always tear down the MCP client (and its Docker container).
            await client.shutdown()


if __name__ == "__main__":
    # Script entry point: drive the async main() to completion.
    asyncio.run(main())
2 changes: 1 addition & 1 deletion hud/agents/misc/response_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ async def determine_response(self, agent_message: str) -> ResponseType:
"""
try:
response = await self.client.chat.completions.create(
model="gpt-4o",
model="gpt-5-nano",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Invalid Model Request Causes API Failures

The ResponseAgent now attempts to use "gpt-5-nano" for chat completions. This model is not a valid OpenAI model, likely a placeholder, causing API requests to fail.

Fix in Cursor Fix in Web

messages=[
{"role": "system", "content": self.system_prompt},
{
Expand Down
40 changes: 29 additions & 11 deletions hud/agents/openai_chat_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import mcp.types as types

from hud import instrument
from hud.types import AgentResponse, MCPToolCall, MCPToolResult

from .base import MCPAgent
Expand Down Expand Up @@ -52,6 +53,7 @@ def __init__(
self.model_name = model_name
self.parallel_tool_calls = parallel_tool_calls
self.logprobs = logprobs
self.conversation_history = []

@staticmethod
def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type]
Expand All @@ -64,9 +66,7 @@ def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type]

async def get_system_messages(self) -> list[Any]:
"""Get system messages for OpenAI."""
return [
{"role": "system", "content": self.system_prompt},
]
return [{"role": "system", "content": self.system_prompt}]

async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
"""Format blocks for OpenAI."""
Expand Down Expand Up @@ -96,8 +96,14 @@ def get_tool_schemas(self) -> list[dict]:
openai_tools.append(openai_tool)
return openai_tools

@instrument(
span_type="agent",
record_args=False,
record_result=True,
)
async def get_response(self, messages: list[Any]) -> AgentResponse:
"""Send chat request to OpenAI and convert the response."""

# Convert MCP tool schemas to OpenAI format
mcp_schemas = self.get_tool_schemas()

Expand All @@ -112,6 +118,19 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
choice = response.choices[0]
msg = choice.message

assistant_msg: dict[str, Any] = {"role": "assistant"}

if msg.content:
assistant_msg["content"] = msg.content

if msg.tool_calls:
assistant_msg["tool_calls"] = msg.tool_calls

messages.append(assistant_msg)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: get_response Method Mutates Input List

The get_response method unexpectedly mutates the input messages list by appending the assistant's response. This can cause issues like duplicate messages or incorrect conversation history for callers reusing the list.

Fix in Cursor Fix in Web


# Store the complete conversation history
self.conversation_history = messages.copy()

tool_calls = []
if msg.tool_calls:
for tc in msg.tool_calls:
Expand All @@ -123,7 +142,7 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
return AgentResponse(
content=msg.content or "",
tool_calls=tool_calls,
done=choice.finish_reason == "stop",
done=choice.finish_reason in ("stop", "length"),
raw=response, # Include raw response for access to Choice objects
)

Expand All @@ -144,11 +163,10 @@ async def format_tool_results(
for c in res.content
if hasattr(c, "text")
)
rendered.append(
{
"role": "tool",
"tool_call_id": call.id,
"content": content or "", # Ensure content is never None
}
)
tool_msg = {
"role": "tool",
"tool_call_id": call.id,
"content": content or "", # Ensure content is never None
}
rendered.append(tool_msg)
return rendered
Loading