diff --git a/examples/openai_2048.py b/examples/openai_2048.py new file mode 100644 index 000000000..f83aab590 --- /dev/null +++ b/examples/openai_2048.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +OpenAI Chat Agent playing Text 2048 + +This example demonstrates using the OpenAIChatAgent with the text-2048 environment. +It shows how to: +- Initialize an OpenAI client with the openai_chat agent +- Configure the text-2048 environment +- Run the agent to play the game + +Requirements: +- pip install openai +- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints + +Environment Variables: +- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint +- OPENAI_API_KEY: API key for authentication +""" + +import asyncio +import os +from openai import AsyncOpenAI +import hud +from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.clients import MCPClient +from hud.datasets import Task + + +async def main(): + # Initialize OpenAI client with environment variables + base_url = os.getenv("OPENAI_BASE_URL") + api_key = os.getenv("OPENAI_API_KEY") + + openai_client = AsyncOpenAI( + base_url=base_url if base_url else None, # None will use default OpenAI endpoint + api_key=api_key, + ) + + mcp_config = { + "local": { + "command": "docker", + "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"], + } + } + + system_prompt = """You are an expert 2048 game player. Your goal is to reach the tile specified by the user. + +HOW 2048 WORKS: +- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...) +- When you move, all tiles slide in that direction +- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.) +- After each move, a new tile (2 or 4) appears randomly +- Game ends when grid is full and no merges possible + +CRITICAL RULES: +- ALWAYS analyze the board before moving +- ALWAYS make a tool call for your move +- Use the 'move' tool with these choices: "up", "down", "left", or "right" +- Remember: ALL strings in JSON must have quotes! +- Make exactly ONE move per turn +- NEVER ask for permission - just keep playing until the game ends +- Don't ask "Should I continue?" - just make your next move + +Example tool call: {"name": "move", "arguments": {"direction": "right"}}""" + + # Define the task with game setup and evaluation + task = Task( + prompt="""Aim for the 128 tile (atleast a score of 800!)""", + mcp_config=mcp_config, + setup_tool={ + "name": "setup", + "arguments": {"name": "board", "arguments": {"board_size": 4}}, + }, # type: ignore + evaluate_tool={ + "name": "evaluate", + "arguments": {"name": "max_number", "arguments": {"target": 128}}, + }, # type: ignore + ) + + # Initialize MCP client + client = MCPClient(mcp_config=task.mcp_config) + + model_name = "gpt-5-mini" # Replace with your model name + + # Create OpenAI agent with the text-2048 game tools + agent = GenericOpenAIChatAgent( + mcp_client=client, + openai_client=openai_client, + model_name=model_name, + allowed_tools=["move"], + parallel_tool_calls=False, + system_prompt=system_prompt, + ) + + agent.metadata = {} + + with hud.trace("OpenAI 2048 Game"): + try: + print("šŸŽ® Starting 2048 game with OpenAI agent...") + print(f"šŸ¤– Model: {agent.model_name}") + print("=" * 50) + + result = await agent.run(task, max_steps=100) + + # Display results + print("=" * 50) + print(f"āœ… Game completed!") + print(f"šŸ† Final Score/Max Tile: {result.reward}") + if result.info: + print(f"šŸ“Š Game Stats: {result.info}") + + # Display conversation history + print("šŸ—£ļø Conversation History:") + for i, msg in enumerate(agent.conversation_history): + print(f" {i + 1} : {msg}") + print("-" * 30) + + except Exception as e: + print(f"āŒ Error during game: {e}") + finally: + await client.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py new file mode 100644 index 000000000..e9a3723c9 --- /dev/null +++ b/examples/openai_browser_2048.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +""" +OpenAI Chat Agent playing Browser 2048 + +This example demonstrates using the OpenAIChatAgent with the browser-based 2048 game. +It shows how to: +- Initialize an OpenAI client with browser automation capabilities +- Configure the browser-2048 environment with Docker +- Use computer vision and interaction tools to play the game + +Requirements: +- pip install openai +- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints +- Docker installed and running + +Environment Variables: +- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional) +- OPENAI_API_KEY: API key for authentication +""" + +import asyncio +import os +from openai import AsyncOpenAI +import hud +from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.clients import MCPClient +from hud.datasets import Task + + +async def main(): + # Initialize OpenAI client with environment variables + base_url = os.getenv("OPENAI_BASE_URL") + api_key = os.getenv("OPENAI_API_KEY") + + openai_client = AsyncOpenAI( + base_url=base_url if base_url else None, + api_key=api_key, + ) + + # Configure the browser-2048 environment + mcp_config = { + "local": { + "command": "docker", + "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"], + } + } + + system_prompt = """You are an expert 2048 game player using a browser interface. Your goal is to reach the tile specified by the user. + +HOW 2048 WORKS: +- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...) +- When you move, all tiles slide in that direction +- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.) +- After each move, a new tile (2 or 4) appears randomly +- Game ends when grid is full and no merges possible + +BROWSER INTERACTION USING THE COMPUTER TOOL: +1. FIRST TURN ONLY - TAKE SCREENSHOT: + Use: computer(action="screenshot") + This captures the initial game state. Only needed for your first turn. + After that, the environment will automatically return an image with each successful move. + +2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press": + - Move UP: computer(action="press", keys=["up"]) + - Move DOWN: computer(action="press", keys=["down"]) + - Move LEFT: computer(action="press", keys=["left"]) + - Move RIGHT: computer(action="press", keys=["right"]) + +CRITICAL RULES: +- Make exactly ONE move per turn using the press action with arrow keys +- Continue playing until you reach the target or the game ends, no need to ask the user for confirmation. + +Strategy tips: +- Keep your highest tiles in a corner +- Build tiles in descending order from the corner +- Avoid random moves - be strategic +- Try to keep the board organized""" + + # Define the task with browser game setup and evaluation + task = Task( + prompt="""Play the browser-based 2048 game and try to reach the 128 tile. + + Start by taking a screenshot to see the initial game board, then make strategic moves using arrow keys. + After your first screenshot, the game board will be automatically shown after each successful move.""", + mcp_config=mcp_config, + setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}}, # type: ignore + evaluate_tool={ + "name": "evaluate", + "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}}, + }, # type: ignore + ) + + # Initialize MCP client + client = MCPClient(mcp_config=task.mcp_config) + + model_name = "z-ai/glm-4.5v" # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc... + + # Create OpenAI agent with browser automation tools + agent = GenericOpenAIChatAgent( + mcp_client=client, + openai_client=openai_client, + model_name=model_name, + allowed_tools=["computer"], + parallel_tool_calls=False, + system_prompt=system_prompt, + ) + + agent.metadata = {} + + # Run the game with tracing + with hud.trace("OpenAI Browser 2048 Game"): + try: + print("šŸŽ® Starting browser-based 2048 game with OpenAI agent...") + print(f"šŸ¤– Model: {agent.model_name}") + print(f"🌐 Browser environment running on localhost:8080") + print("=" * 50) + + result = await agent.run(task, max_steps=100) + + # Display results + print("=" * 50) + print(f"āœ… Game completed!") + print(f"šŸ† Final Score/Max Tile: {result.reward}") + if result.info: + print(f"šŸ“Š Game Stats: {result.info}") + + print("\nšŸ“ Full interaction trace:") + for i, msg in enumerate(agent.conversation_history): + print(f" {i + 1} : {msg}") + print("-" * 30) + + except Exception as e: + print(f"āŒ Error during game: {e}") + finally: + await client.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py index 4e7df2c2e..2b24b2113 100644 --- a/hud/agents/misc/response_agent.py +++ b/hud/agents/misc/response_agent.py @@ -54,7 +54,7 @@ async def determine_response(self, agent_message: str) -> ResponseType: """ try: response = await self.client.chat.completions.create( - model="gpt-4o", + model="gpt-5-nano", messages=[ {"role": "system", "content": self.system_prompt}, { diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index 847dc158d..8e61be940 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -21,6 +21,7 @@ import mcp.types as types +from hud import instrument from hud.types import AgentResponse, MCPToolCall, MCPToolResult from .base import MCPAgent @@ -52,6 +53,7 @@ def __init__( self.model_name = model_name self.parallel_tool_calls = parallel_tool_calls self.logprobs = logprobs + self.conversation_history = [] @staticmethod def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type] @@ -64,40 +66,114 @@ def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type] async def get_system_messages(self) -> list[Any]: """Get system messages for OpenAI.""" - return [ - {"role": "system", "content": self.system_prompt}, - ] + return [{"role": "system", "content": self.system_prompt}] async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]: """Format blocks for OpenAI.""" - return [ - { - "role": "user", - "content": [ - {"type": "text", "text": block.text} - for block in blocks - if isinstance(block, types.TextContent) - ], - }, - ] + content = [] + for block in blocks: + if isinstance(block, types.TextContent): + content.append({"type": "text", "text": block.text}) + elif isinstance(block, types.ImageContent): + content.append( + { + "type": "image_url", + "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"}, + } + ) + + return [{"role": "user", "content": content}] + + def _sanitize_schema_for_openai(self, schema: dict) -> dict: + """Convert MCP JSON Schema to OpenAI-compatible format. + + Handles unsupported features like anyOf and prefixItems. + """ + if not isinstance(schema, dict): + return schema + + sanitized = {} + + for key, value in schema.items(): + if key == "anyOf" and isinstance(value, list): + # Handle anyOf patterns (usually for nullable fields) + non_null_types = [ + v for v in value if not (isinstance(v, dict) and v.get("type") == "null") + ] + if non_null_types: + # Use the first non-null type + sanitized.update(self._sanitize_schema_for_openai(non_null_types[0])) + else: + sanitized["type"] = "string" # Fallback + + elif key == "prefixItems": + # Convert prefixItems to simple items + sanitized["type"] = "array" + if isinstance(value, list) and value: + # Use the type from the first item as the items schema + first_item = value[0] + if isinstance(first_item, dict): + sanitized["items"] = {"type": first_item.get("type", "string")} + else: + sanitized["items"] = {"type": "string"} + + elif key == "properties" and isinstance(value, dict): + # Recursively sanitize property schemas + sanitized[key] = { + prop_name: self._sanitize_schema_for_openai(prop_schema) + for prop_name, prop_schema in value.items() + } + + elif key == "items" and isinstance(value, dict): + # Recursively sanitize items schema + sanitized[key] = self._sanitize_schema_for_openai(value) + + elif key in ( + "type", + "description", + "enum", + "required", + "default", + "minimum", + "maximum", + "minItems", + "maxItems", + ): + # These are supported by OpenAI + sanitized[key] = value + + return sanitized or {"type": "object"} def get_tool_schemas(self) -> list[dict]: tool_schemas = super().get_tool_schemas() openai_tools = [] for schema in tool_schemas: + parameters = schema.get("parameters", {}) + + if parameters: + sanitized_params = self._sanitize_schema_for_openai(parameters) + else: + sanitized_params = {"type": "object", "properties": {}} + openai_tool = { "type": "function", "function": { "name": schema["name"], "description": schema.get("description", ""), - "parameters": schema.get("parameters", {"type": "object", "properties": {}}), + "parameters": sanitized_params, }, } openai_tools.append(openai_tool) return openai_tools + @instrument( + span_type="agent", + record_args=False, + record_result=True, + ) async def get_response(self, messages: list[Any]) -> AgentResponse: """Send chat request to OpenAI and convert the response.""" + # Convert MCP tool schemas to OpenAI format mcp_schemas = self.get_tool_schemas() @@ -112,6 +188,19 @@ async def get_response(self, messages: list[Any]) -> AgentResponse: choice = response.choices[0] msg = choice.message + assistant_msg: dict[str, Any] = {"role": "assistant"} + + if msg.content: + assistant_msg["content"] = msg.content + + if msg.tool_calls: + assistant_msg["tool_calls"] = msg.tool_calls + + messages.append(assistant_msg) + + # Store the complete conversation history + self.conversation_history = messages.copy() + tool_calls = [] if msg.tool_calls: for tc in msg.tool_calls: @@ -123,7 +212,7 @@ async def get_response(self, messages: list[Any]) -> AgentResponse: return AgentResponse( content=msg.content or "", tool_calls=tool_calls, - done=choice.finish_reason == "stop", + done=choice.finish_reason in ("stop", "length"), raw=response, # Include raw response for access to Choice objects ) @@ -132,23 +221,65 @@ async def format_tool_results( tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult], ) -> list[Any]: - """Render MCP tool results as OpenAI ``role=tool`` messages.""" + """Render MCP tool results as OpenAI messages. + + Note: OpenAI tool messages only support string content. + When images are present, we return both a tool message and a user message. + """ rendered: list[dict[str, Any]] = [] for call, res in zip(tool_calls, tool_results, strict=False): - if res.structuredContent: - content = json.dumps(res.structuredContent) - else: - # Concatenate any TextContent blocks - content = "".join( - c.text # type: ignore[attr-defined] - for c in res.content - if hasattr(c, "text") - ) + # Use structuredContent.result if available, otherwise use content + items = res.content + if res.structuredContent and isinstance(res.structuredContent, dict): + items = res.structuredContent.get("result", res.content) + + # Separate text and image content + text_parts = [] + image_parts = [] + + for item in items: + if isinstance(item, dict): + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + elif item.get("type") == "image": + image_parts.append( + { + "type": "image_url", + "image_url": { + "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}" + }, + } + ) + elif isinstance(item, types.TextContent): + text_parts.append(item.text) + elif isinstance(item, types.ImageContent): + image_parts.append( + { + "type": "image_url", + "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"}, + } + ) + + text_content = "".join(text_parts) if text_parts else "Tool executed successfully" rendered.append( { "role": "tool", "tool_call_id": call.id, - "content": content or "", # Ensure content is never None + "content": text_content, } ) + + # If there are images, add them as a separate user message + if image_parts: + # Add a user message with the images + content_with_images = [ + {"type": "text", "text": "Tool returned the following:"} + ] + image_parts + rendered.append( + { + "role": "user", + "content": content_with_images, + } + ) + return rendered diff --git a/hud/datasets/execution/parallel.py b/hud/datasets/execution/parallel.py index dfa385219..45311b915 100644 --- a/hud/datasets/execution/parallel.py +++ b/hud/datasets/execution/parallel.py @@ -75,8 +75,8 @@ def _process_worker( pass # Set up signal handler for clean interruption - def signal_handler(signum, frame): - logger.warning(f"Worker {worker_id}: Received interrupt signal") + def signal_handler(signum: int, frame: Any) -> None: + logger.warning("Worker %s: Received interrupt signal", worker_id) # Raise KeyboardInterrupt to actually interrupt the worker raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user") @@ -171,7 +171,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in results = await asyncio.gather(*tasks, return_exceptions=False) return results except asyncio.CancelledError: - logger.info(f"Worker {worker_id}: Tasks cancelled due to interruption") + logger.info("Worker %s: Tasks cancelled due to interruption", worker_id) # Return error results for all tasks return [ ( @@ -208,7 +208,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in return results except KeyboardInterrupt: - logger.info(f"Worker {worker_id}: Interrupted by user, stopping gracefully") + logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id) # Return partial results for tasks that completed partial_results = [] for idx, _ in task_batch: @@ -489,7 +489,7 @@ async def run_dataset_parallel_manual( "content": "Task interrupted (Ctrl+C)", } - logger.info(f"Interrupted after {completed}/{total} tasks") + logger.info("Interrupted after %s/%s tasks", completed, total) raise # Re-raise to propagate the interrupt finally: diff --git a/hud/otel/exporters.py b/hud/otel/exporters.py index 4d5dd9a1e..82e2ac954 100644 --- a/hud/otel/exporters.py +++ b/hud/otel/exporters.py @@ -14,9 +14,9 @@ import contextlib import json import logging +import time from collections import defaultdict from datetime import UTC, datetime -import time from typing import TYPE_CHECKING, Any from mcp.types import ClientRequest, ServerResult