diff --git a/examples/openai_2048.py b/examples/openai_2048.py
new file mode 100644
index 000000000..f83aab590
--- /dev/null
+++ b/examples/openai_2048.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+OpenAI Chat Agent playing Text 2048
+
+This example demonstrates using the GenericOpenAIChatAgent with the text-2048 environment.
+It shows how to:
+- Initialize an OpenAI client and hand it to the GenericOpenAIChatAgent
+- Configure the text-2048 environment
+- Run the agent to play the game
+
+Requirements:
+- pip install openai
+- export OPENAI_API_KEY="your-api-key"  # Or set OPENAI_BASE_URL for custom endpoints
+
+Environment Variables:
+- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint
+- OPENAI_API_KEY: API key for authentication
+"""
+
+import asyncio
+import os
+from openai import AsyncOpenAI
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients import MCPClient
+from hud.datasets import Task
+
+
+async def main() -> None:
+    """Run one text-2048 episode with ``GenericOpenAIChatAgent`` and print the outcome.
+
+    The game runs in the ``hudevals/hud-text-2048`` Docker container, reached through
+    an MCP client that is shut down whether or not the run succeeds.
+    """
+    # An unset or empty OPENAI_BASE_URL becomes None (use the default OpenAI endpoint)
+    base_url = os.getenv("OPENAI_BASE_URL") or None
+    api_key = os.getenv("OPENAI_API_KEY")
+    openai_client = AsyncOpenAI(base_url=base_url, api_key=api_key)
+
+    mcp_config = {
+        "local": {
+            "command": "docker",
+            "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"],
+        }
+    }
+
+    system_prompt = """You are an expert 2048 game player. Your goal is to reach the tile specified by the user.
+
+HOW 2048 WORKS:
+- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
+- When you move, all tiles slide in that direction
+- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
+- After each move, a new tile (2 or 4) appears randomly
+- Game ends when grid is full and no merges possible
+
+CRITICAL RULES:
+- ALWAYS analyze the board before moving
+- ALWAYS make a tool call for your move
+- Use the 'move' tool with these choices: "up", "down", "left", or "right"
+- Remember: ALL strings in JSON must have quotes!
+- Make exactly ONE move per turn
+- NEVER ask for permission - just keep playing until the game ends
+- Don't ask "Should I continue?" - just make your next move
+
+Example tool call: {"name": "move", "arguments": {"direction": "right"}}"""
+
+    # Define the task with game setup and evaluation
+    task = Task(
+        prompt="""Aim for the 128 tile (atleast a score of 800!)""",
+        mcp_config=mcp_config,
+        setup_tool={
+            "name": "setup",
+            "arguments": {"name": "board", "arguments": {"board_size": 4}},
+        },  # type: ignore
+        evaluate_tool={
+            "name": "evaluate",
+            "arguments": {"name": "max_number", "arguments": {"target": 128}},
+        },  # type: ignore
+    )
+
+    # Initialize MCP client
+    client = MCPClient(mcp_config=task.mcp_config)
+    model_name = "gpt-5-mini"  # Replace with your model name
+
+    # Create OpenAI agent with the text-2048 game tools
+    agent = GenericOpenAIChatAgent(
+        mcp_client=client,
+        openai_client=openai_client,
+        model_name=model_name,
+        allowed_tools=["move"],
+        parallel_tool_calls=False,
+        system_prompt=system_prompt,
+    )
+
+    agent.metadata = {}
+
+    with hud.trace("OpenAI 2048 Game"):
+        try:
+            print("🎮 Starting 2048 game with OpenAI agent...")
+            print(f"🤖 Model: {agent.model_name}")
+            print("=" * 50)
+
+            result = await agent.run(task, max_steps=100)
+
+            # Display results
+            print("=" * 50)
+            print("✅ Game completed!")
+            print(f"🏆 Final Score/Max Tile: {result.reward}")
+            if result.info:
+                print(f"📊 Game Stats: {result.info}")
+
+            # Display conversation history
+            print("🗣️ Conversation History:")
+            for i, msg in enumerate(agent.conversation_history):
+                print(f"  {i + 1} : {msg}")
+                print("-" * 30)
+
+        except Exception as e:
+            print(f"❌ Error during game: {e}")
+        finally:
+            await client.shutdown()
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+    asyncio.run(main())
diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py
new file mode 100644
index 000000000..e9a3723c9
--- /dev/null
+++ b/examples/openai_browser_2048.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+OpenAI Chat Agent playing Browser 2048
+
+This example demonstrates using the GenericOpenAIChatAgent with the browser-based 2048 game.
+It shows how to:
+- Initialize an OpenAI client and hand it to the GenericOpenAIChatAgent
+- Configure the browser-2048 environment with Docker
+- Drive the game through screenshots and key presses sent with the computer tool
+
+Requirements:
+- pip install openai
+- export OPENAI_API_KEY="your-api-key"  # Or set OPENAI_BASE_URL for custom endpoints
+- Docker installed and running
+
+Environment Variables:
+- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional)
+- OPENAI_API_KEY: API key for authentication
+"""
+
+import asyncio
+import os
+from openai import AsyncOpenAI
+import hud
+from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
+from hud.clients import MCPClient
+from hud.datasets import Task
+
+
+async def main() -> None:
+    """Run one browser-2048 episode with ``GenericOpenAIChatAgent`` and print the outcome.
+
+    The game runs in the ``hudevals/hud-browser`` Docker container (port 8080 published),
+    reached through an MCP client that is shut down whether or not the run succeeds.
+    """
+    # An unset or empty OPENAI_BASE_URL becomes None before it reaches the client
+    base_url = os.getenv("OPENAI_BASE_URL") or None
+    api_key = os.getenv("OPENAI_API_KEY")
+    openai_client = AsyncOpenAI(base_url=base_url, api_key=api_key)
+
+    # Configure the browser-2048 environment
+    mcp_config = {
+        "local": {
+            "command": "docker",
+            "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"],
+        }
+    }
+
+    system_prompt = """You are an expert 2048 game player using a browser interface. Your goal is to reach the tile specified by the user.
+
+HOW 2048 WORKS:
+- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...)
+- When you move, all tiles slide in that direction
+- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.)
+- After each move, a new tile (2 or 4) appears randomly
+- Game ends when grid is full and no merges possible
+
+BROWSER INTERACTION USING THE COMPUTER TOOL:
+1. FIRST TURN ONLY - TAKE SCREENSHOT:
+   Use: computer(action="screenshot")
+   This captures the initial game state. Only needed for your first turn.
+   After that, the environment will automatically return an image with each successful move.
+
+2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press":
+   - Move UP: computer(action="press", keys=["up"])
+   - Move DOWN: computer(action="press", keys=["down"]) 
+   - Move LEFT: computer(action="press", keys=["left"])
+   - Move RIGHT: computer(action="press", keys=["right"])
+
+CRITICAL RULES:
+- Make exactly ONE move per turn using the press action with arrow keys
+- Continue playing until you reach the target or the game ends, no need to ask the user for confirmation.
+
+Strategy tips:
+- Keep your highest tiles in a corner
+- Build tiles in descending order from the corner
+- Avoid random moves - be strategic
+- Try to keep the board organized"""
+
+    # Define the task with browser game setup and evaluation
+    task = Task(
+        prompt="""Play the browser-based 2048 game and try to reach the 128 tile.
+
+        Start by taking a screenshot to see the initial game board, then make strategic moves using arrow keys.
+        After your first screenshot, the game board will be automatically shown after each successful move.""",
+        mcp_config=mcp_config,
+        setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}},  # type: ignore
+        evaluate_tool={
+            "name": "evaluate",
+            "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}},
+        },  # type: ignore
+    )
+
+    # Initialize MCP client
+    client = MCPClient(mcp_config=task.mcp_config)
+    model_name = "z-ai/glm-4.5v"  # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc...
+
+    # Create OpenAI agent with browser automation tools
+    agent = GenericOpenAIChatAgent(
+        mcp_client=client,
+        openai_client=openai_client,
+        model_name=model_name,
+        allowed_tools=["computer"],
+        parallel_tool_calls=False,
+        system_prompt=system_prompt,
+    )
+
+    agent.metadata = {}
+
+    # Run the game with tracing
+    with hud.trace("OpenAI Browser 2048 Game"):
+        try:
+            print("🎮 Starting browser-based 2048 game with OpenAI agent...")
+            print(f"🤖 Model: {agent.model_name}")
+            print("🌐 Browser environment running on localhost:8080")
+            print("=" * 50)
+
+            result = await agent.run(task, max_steps=100)
+
+            # Display results
+            print("=" * 50)
+            print("✅ Game completed!")
+            print(f"🏆 Final Score/Max Tile: {result.reward}")
+            if result.info:
+                print(f"📊 Game Stats: {result.info}")
+
+            print("\n📝 Full interaction trace:")
+            for i, msg in enumerate(agent.conversation_history):
+                print(f"  {i + 1} : {msg}")
+                print("-" * 30)
+
+        except Exception as e:
+            print(f"❌ Error during game: {e}")
+        finally:
+            await client.shutdown()
+
+
+if __name__ == "__main__":  # run the example only when executed as a script
+    asyncio.run(main())
diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py
index 4e7df2c2e..2b24b2113 100644
--- a/hud/agents/misc/response_agent.py
+++ b/hud/agents/misc/response_agent.py
@@ -54,7 +54,7 @@ async def determine_response(self, agent_message: str) -> ResponseType:
         """
         try:
             response = await self.client.chat.completions.create(
-                model="gpt-4o",
+                model="gpt-5-nano",
                 messages=[
                     {"role": "system", "content": self.system_prompt},
                     {
diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py
index 847dc158d..8e61be940 100644
--- a/hud/agents/openai_chat_generic.py
+++ b/hud/agents/openai_chat_generic.py
@@ -21,6 +21,7 @@
 
 import mcp.types as types
 
+from hud import instrument
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult
 
 from .base import MCPAgent
@@ -52,6 +53,7 @@ def __init__(
         self.model_name = model_name
         self.parallel_tool_calls = parallel_tool_calls
         self.logprobs = logprobs
+        self.conversation_history = []
 
     @staticmethod
     def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
@@ -64,40 +66,114 @@ def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
 
     async def get_system_messages(self) -> list[Any]:
         """Get system messages for OpenAI."""
-        return [
-            {"role": "system", "content": self.system_prompt},
-        ]
+        return [{"role": "system", "content": self.system_prompt}]
 
     async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
         """Format blocks for OpenAI."""
-        return [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": block.text}
-                    for block in blocks
-                    if isinstance(block, types.TextContent)
-                ],
-            },
-        ]
+        content = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                content.append({"type": "text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
+                    }
+                )
+
+        return [{"role": "user", "content": content}]
+
+    def _sanitize_schema_for_openai(self, schema: dict) -> dict:
+        """Reduce an MCP JSON Schema to the keyword subset this agent sends to OpenAI.
+
+        ``anyOf`` collapses to its first non-null alternative, ``prefixItems`` becomes a
+        plain array, ``properties``/``items`` are sanitized recursively, and keywords
+        outside the passthrough tuple are dropped. Non-dict input is returned unchanged;
+        a dict with no surviving keyword yields ``{"type": "object"}``.
+        """
+        if not isinstance(schema, dict):
+            return schema
+
+        sanitized = {}
+        for key, value in schema.items():
+            if key == "anyOf" and isinstance(value, list):
+                # Usually a nullable field: keep the first alternative that is not null
+                non_null_types = [
+                    v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
+                ]
+                if non_null_types:
+                    sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
+                else:
+                    sanitized["type"] = "string"  # Fallback
+
+            elif key == "prefixItems":
+                # Tuple-style arrays are flattened to the type of their first slot
+                sanitized["type"] = "array"
+                if isinstance(value, list) and value:
+                    first_item = value[0]
+                    if isinstance(first_item, dict):
+                        sanitized["items"] = {"type": first_item.get("type", "string")}
+                    else:
+                        sanitized["items"] = {"type": "string"}
+                else:
+                    # OpenAI rejects array schemas that have no "items"
+                    sanitized.setdefault("items", {"type": "string"})
+
+            elif key == "properties" and isinstance(value, dict):
+                sanitized[key] = {
+                    prop_name: self._sanitize_schema_for_openai(prop_schema)
+                    for prop_name, prop_schema in value.items()
+                }
+
+            elif key == "items" and isinstance(value, dict):
+                sanitized[key] = self._sanitize_schema_for_openai(value)
+
+            elif key in (
+                "type",
+                "description",
+                "enum",
+                "required",
+                "default",
+                "minimum",
+                "maximum",
+                "minItems",
+                "maxItems",
+            ):
+                sanitized[key] = value
+
+        return sanitized or {"type": "object"}
 
     def get_tool_schemas(self) -> list[dict]:
         tool_schemas = super().get_tool_schemas()
         openai_tools = []
         for schema in tool_schemas:
+            parameters = schema.get("parameters", {})
+
+            if parameters:
+                sanitized_params = self._sanitize_schema_for_openai(parameters)
+            else:
+                sanitized_params = {"type": "object", "properties": {}}
+
             openai_tool = {
                 "type": "function",
                 "function": {
                     "name": schema["name"],
                     "description": schema.get("description", ""),
-                    "parameters": schema.get("parameters", {"type": "object", "properties": {}}),
+                    "parameters": sanitized_params,
                 },
             }
             openai_tools.append(openai_tool)
         return openai_tools
 
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
     async def get_response(self, messages: list[Any]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
+
         # Convert MCP tool schemas to OpenAI format
         mcp_schemas = self.get_tool_schemas()
 
@@ -112,6 +188,19 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
         choice = response.choices[0]
         msg = choice.message
 
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+
+        if msg.content:
+            assistant_msg["content"] = msg.content
+
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+
+        messages.append(assistant_msg)
+
+        # Store the complete conversation history
+        self.conversation_history = messages.copy()
+
         tool_calls = []
         if msg.tool_calls:
             for tc in msg.tool_calls:
@@ -123,7 +212,7 @@ async def get_response(self, messages: list[Any]) -> AgentResponse:
         return AgentResponse(
             content=msg.content or "",
             tool_calls=tool_calls,
-            done=choice.finish_reason == "stop",
+            done=choice.finish_reason in ("stop", "length"),
             raw=response,  # Include raw response for access to Choice objects
         )
 
@@ -132,23 +221,65 @@ async def format_tool_results(
         tool_calls: list[MCPToolCall],
         tool_results: list[MCPToolResult],
     ) -> list[Any]:
-        """Render MCP tool results as OpenAI ``role=tool`` messages."""
+        """Render MCP tool results as OpenAI messages.
+
+        Note: OpenAI tool messages only support string content.
+        When images are present, we return both a tool message and a user message.
+        """
         rendered: list[dict[str, Any]] = []
         for call, res in zip(tool_calls, tool_results, strict=False):
-            if res.structuredContent:
-                content = json.dumps(res.structuredContent)
-            else:
-                # Concatenate any TextContent blocks
-                content = "".join(
-                    c.text  # type: ignore[attr-defined]
-                    for c in res.content
-                    if hasattr(c, "text")
-                )
+            # Use structuredContent.result if available, otherwise use content
+            items = res.content
+            if res.structuredContent and isinstance(res.structuredContent, dict):
+                items = res.structuredContent.get("result", res.content)
+
+            # Separate text and image content
+            text_parts = []
+            image_parts = []
+
+            for item in items:
+                if isinstance(item, dict):
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                    elif item.get("type") == "image":
+                        image_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}"
+                                },
+                            }
+                        )
+                elif isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_parts.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
+                        }
+                    )
+
+            text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
             rendered.append(
                 {
                     "role": "tool",
                     "tool_call_id": call.id,
-                    "content": content or "",  # Ensure content is never None
+                    "content": text_content,
                 }
             )
+
+            # If there are images, add them as a separate user message
+            if image_parts:
+                # Add a user message with the images
+                content_with_images = [
+                    {"type": "text", "text": "Tool returned the following:"}
+                ] + image_parts
+                rendered.append(
+                    {
+                        "role": "user",
+                        "content": content_with_images,
+                    }
+                )
+
         return rendered
diff --git a/hud/datasets/execution/parallel.py b/hud/datasets/execution/parallel.py
index dfa385219..45311b915 100644
--- a/hud/datasets/execution/parallel.py
+++ b/hud/datasets/execution/parallel.py
@@ -75,8 +75,8 @@ def _process_worker(
         pass
 
     # Set up signal handler for clean interruption
-    def signal_handler(signum, frame):
-        logger.warning(f"Worker {worker_id}: Received interrupt signal")
+    def signal_handler(signum: int, frame: Any) -> None:
+        logger.warning("Worker %s: Received interrupt signal", worker_id)
         # Raise KeyboardInterrupt to actually interrupt the worker
         raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
 
@@ -171,7 +171,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in
             results = await asyncio.gather(*tasks, return_exceptions=False)
             return results
         except asyncio.CancelledError:
-            logger.info(f"Worker {worker_id}: Tasks cancelled due to interruption")
+            logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
             # Return error results for all tasks
             return [
                 (
@@ -208,7 +208,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in
 
         return results
     except KeyboardInterrupt:
-        logger.info(f"Worker {worker_id}: Interrupted by user, stopping gracefully")
+        logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
         # Return partial results for tasks that completed
         partial_results = []
         for idx, _ in task_batch:
@@ -489,7 +489,7 @@ async def run_dataset_parallel_manual(
                             "content": "Task interrupted (Ctrl+C)",
                         }
 
-                logger.info(f"Interrupted after {completed}/{total} tasks")
+                logger.info("Interrupted after %s/%s tasks", completed, total)
                 raise  # Re-raise to propagate the interrupt
 
         finally:
diff --git a/hud/otel/exporters.py b/hud/otel/exporters.py
index 4d5dd9a1e..82e2ac954 100644
--- a/hud/otel/exporters.py
+++ b/hud/otel/exporters.py
@@ -14,9 +14,9 @@
 import contextlib
 import json
 import logging
+import time
 from collections import defaultdict
 from datetime import UTC, datetime
-import time
 from typing import TYPE_CHECKING, Any
 
 from mcp.types import ClientRequest, ServerResult