From 4c2fd939d3f9f8ff830fe44d9f9719ff26d17d39 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 29 Aug 2025 20:00:16 -0700 Subject: [PATCH 1/7] Add OpenAI 2048 example and fix agent loop --- examples/openai_2048.py | 127 ++++++++++++++++++++++++++++++ hud/agents/misc/response_agent.py | 2 +- hud/agents/openai_chat_generic.py | 38 ++++++--- 3 files changed, 156 insertions(+), 11 deletions(-) create mode 100644 examples/openai_2048.py diff --git a/examples/openai_2048.py b/examples/openai_2048.py new file mode 100644 index 000000000..6775a2e5d --- /dev/null +++ b/examples/openai_2048.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +""" +OpenAI Chat Agent playing Text 2048 + +This example demonstrates using the OpenAIChatAgent with the text-2048 environment. +It shows how to: +- Initialize an OpenAI client with the openai_chat agent +- Configure the text-2048 environment +- Run the agent to play the game + +Requirements: +- pip install openai +- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints + +Environment Variables: +- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional) +- OPENAI_API_KEY: API key for authentication +""" + +import asyncio +import os +from openai import AsyncOpenAI +import hud +from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.clients import MCPClient +from hud.datasets import Task + +from hud.agents.misc import ResponseAgent + + +async def main(): + # Initialize OpenAI client with environment variables + base_url = os.getenv("OPENAI_BASE_URL") # Optional custom endpoint + api_key = os.getenv("OPENAI_API_KEY", "EMPTY") # Default to "EMPTY" for local servers + + openai_client = AsyncOpenAI( + base_url=base_url, # None will use default OpenAI endpoint + api_key=api_key, + ) + + # Configure the text-2048 environment + mcp_config = { + "local": { + "command": "docker", + "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"] + } + } + + # Define the task with game setup and evaluation + task = Task( + prompt="""Play the 2048 game strategically. + + Tips for high scores: + - Keep your highest tile in a corner (preferably bottom-right) + - Build tiles in descending order from that corner + - Avoid moving up unless absolutely necessary + - Try to keep tiles of similar values adjacent + + Use the 'move' tool with directions: up, down, left, or right. + Aim for the highest possible score!""", + mcp_config=mcp_config, + setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore + evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore + ) + + # Initialize MCP client + client = MCPClient(mcp_config=task.mcp_config) + + # Create OpenAI agent with the text-2048 game tools + agent = GenericOpenAIChatAgent( + mcp_client=client, + openai_client=openai_client, + model_name="Qwen/Qwen2.5-3B-Instruct", + allowed_tools=["move"], + parallel_tool_calls=False, + response_agent=ResponseAgent(), + system_prompt="""You are an expert 2048 game player. + Make strategic moves to achieve the highest score possible. + Always analyze the board state before making a move.""", + ) + + agent.metadata = {} + + # Run the game with tracing + with hud.trace("OpenAI 2048 Game"): + try: + print("šŸŽ® Starting 2048 game with OpenAI agent...") + print(f"šŸ¤– Model: {agent.model_name}") + print("="*50) + + # Run the task with unlimited steps (game ends when no moves available) + result = await agent.run(task, max_steps=-1) + + # Display results + print("="*50) + print(f"āœ… Game completed!") + print(f"šŸ† Final Score/Max Tile: {result.reward}") + if result.info: + print(f"šŸ“Š Game Stats: {result.info}") + + # Display conversation history + print("šŸ—£ļø Conversation History:") + for i, msg in enumerate(agent.conversation_history): + print(f" {i+1} : {msg}") + print("-"*30) + + except Exception as e: + print(f"āŒ Error during game: {e}") + finally: + await client.shutdown() + + +if __name__ == "__main__": + # Check for API configuration + if not os.getenv("OPENAI_API_KEY") and not os.getenv("OPENAI_BASE_URL"): + print("āš ļø Please configure OpenAI API access:") + print(" For OpenAI API: export OPENAI_API_KEY='your-api-key'") + print(" For local/custom endpoints: export OPENAI_BASE_URL='your-custom-endpoint'") + exit(1) + + # Display configuration + if os.getenv("OPENAI_BASE_URL"): + print(f"šŸ”— Using endpoint: {os.getenv('OPENAI_BASE_URL')}") + else: + print("šŸ”— Using default OpenAI API endpoint") + + asyncio.run(main()) \ No newline at end of file diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py index 4e7df2c2e..2b24b2113 100644 --- a/hud/agents/misc/response_agent.py +++ b/hud/agents/misc/response_agent.py @@ -54,7 +54,7 @@ async def determine_response(self, agent_message: str) -> ResponseType: """ try: response = await self.client.chat.completions.create( - model="gpt-4o", + model="gpt-5-nano", messages=[ {"role": "system", "content": self.system_prompt}, { diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index 847dc158d..ca9a01a13 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -21,6 +21,7 @@ import mcp.types as types +from hud import instrument from hud.types import AgentResponse, MCPToolCall, MCPToolResult from .base import MCPAgent @@ -52,6 +53,7 @@ def __init__( self.model_name = model_name self.parallel_tool_calls = parallel_tool_calls self.logprobs = logprobs + self.conversation_history = [] @staticmethod def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type] @@ -64,9 +66,7 @@ def _oai_to_mcp(tool_call: Any) -> MCPToolCall: # type: ignore[valid-type] async def get_system_messages(self) -> list[Any]: """Get system messages for OpenAI.""" - return [ - {"role": "system", "content": self.system_prompt}, - ] + return [{"role": "system", "content": self.system_prompt}] async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]: """Format blocks for OpenAI.""" @@ -96,8 +96,14 @@ def get_tool_schemas(self) -> list[dict]: openai_tools.append(openai_tool) return openai_tools + @instrument( + span_type="agent", + record_args=False, + record_result=True, + ) async def get_response(self, messages: list[Any]) -> AgentResponse: """Send chat request to OpenAI and convert the response.""" + # Convert MCP tool schemas to OpenAI format mcp_schemas = self.get_tool_schemas() @@ -111,6 +117,19 @@ async def get_response(self, messages: list[Any]) -> AgentResponse: choice = response.choices[0] msg = choice.message + + assistant_msg: dict[str, Any] = {"role": "assistant"} + + if msg.content: + assistant_msg["content"] = msg.content + + if msg.tool_calls: + assistant_msg["tool_calls"] = msg.tool_calls + + messages.append(assistant_msg) + + # Store the complete conversation history + self.conversation_history = messages.copy() tool_calls = [] if msg.tool_calls: @@ -144,11 +163,10 @@ async def format_tool_results( for c in res.content if hasattr(c, "text") ) - rendered.append( - { - "role": "tool", - "tool_call_id": call.id, - "content": content or "", # Ensure content is never None - } - ) + tool_msg = { + "role": "tool", + "tool_call_id": call.id, + "content": content or "", # Ensure content is never None + } + rendered.append(tool_msg) return rendered From c2488b5fb8ea40b1fbb797be1ccd4e76fd7c95fe Mon Sep 17 00:00:00 2001 From: Jaideep Date: Fri, 29 Aug 2025 20:05:18 -0700 Subject: [PATCH 2/7] update completion handling to consider 'length' as a valid termination reason --- hud/agents/openai_chat_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index ca9a01a13..a370ba00d 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -142,7 +142,7 @@ async def get_response(self, messages: list[Any]) -> AgentResponse: return AgentResponse( content=msg.content or "", tool_calls=tool_calls, - done=choice.finish_reason == "stop", + done=choice.finish_reason in ("stop", "length"), raw=response, # Include raw response for access to Choice objects ) From 38b90fa2ce0acab43fb089c03d419a779b0a138e Mon Sep 17 00:00:00 2001 From: Jaideep Date: Wed, 3 Sep 2025 22:48:44 -0700 Subject: [PATCH 3/7] examples --- examples/openai_2048.py | 71 ++++++++--------- examples/openai_browser_2048.py | 137 ++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 39 deletions(-) create mode 100644 examples/openai_browser_2048.py diff --git a/examples/openai_2048.py b/examples/openai_2048.py index 6775a2e5d..387b7dd5e 100644 --- a/examples/openai_2048.py +++ b/examples/openai_2048.py @@ -13,7 +13,7 @@ - export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints Environment Variables: -- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional) +- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint - OPENAI_API_KEY: API key for authentication """ @@ -25,39 +25,47 @@ from hud.clients import MCPClient from hud.datasets import Task -from hud.agents.misc import ResponseAgent - async def main(): # Initialize OpenAI client with environment variables - base_url = os.getenv("OPENAI_BASE_URL") # Optional custom endpoint - api_key = os.getenv("OPENAI_API_KEY", "EMPTY") # Default to "EMPTY" for local servers + base_url = os.getenv("OPENAI_BASE_URL") + api_key = os.getenv("OPENAI_API_KEY") openai_client = AsyncOpenAI( - base_url=base_url, # None will use default OpenAI endpoint + base_url=base_url if base_url else None, # None will use default OpenAI endpoint api_key=api_key, ) - # Configure the text-2048 environment mcp_config = { "local": { "command": "docker", "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"] } } - + + system_prompt = """You are an expert 2048 game player. Your goal is to reach the tile specified by the user. + +HOW 2048 WORKS: +- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...) +- When you move, all tiles slide in that direction +- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.) +- After each move, a new tile (2 or 4) appears randomly +- Game ends when grid is full and no merges possible + +CRITICAL RULES: +- ALWAYS analyze the board before moving +- ALWAYS make a tool call for your move +- Use the 'move' tool with these choices: "up", "down", "left", or "right" +- Remember: ALL strings in JSON must have quotes! +- Make exactly ONE move per turn +- NEVER ask for permission - just keep playing until the game ends +- Don't ask "Should I continue?" - just make your next move + +Example tool call: {"name": "move", "arguments": {"direction": "right"}}""" + # Define the task with game setup and evaluation task = Task( - prompt="""Play the 2048 game strategically. - - Tips for high scores: - - Keep your highest tile in a corner (preferably bottom-right) - - Build tiles in descending order from that corner - - Avoid moving up unless absolutely necessary - - Try to keep tiles of similar values adjacent - - Use the 'move' tool with directions: up, down, left, or right. - Aim for the highest possible score!""", + prompt="""Aim for the 128 tile (atleast 800 points!)""", mcp_config=mcp_config, setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore @@ -65,30 +73,27 @@ async def main(): # Initialize MCP client client = MCPClient(mcp_config=task.mcp_config) - + + model_name = "gpt-5-mini" # Replace with your model name + # Create OpenAI agent with the text-2048 game tools agent = GenericOpenAIChatAgent( mcp_client=client, openai_client=openai_client, - model_name="Qwen/Qwen2.5-3B-Instruct", + model_name=model_name, allowed_tools=["move"], parallel_tool_calls=False, - response_agent=ResponseAgent(), - system_prompt="""You are an expert 2048 game player. - Make strategic moves to achieve the highest score possible. - Always analyze the board state before making a move.""", + system_prompt=system_prompt, ) agent.metadata = {} - # Run the game with tracing with hud.trace("OpenAI 2048 Game"): try: print("šŸŽ® Starting 2048 game with OpenAI agent...") print(f"šŸ¤– Model: {agent.model_name}") print("="*50) - # Run the task with unlimited steps (game ends when no moves available) result = await agent.run(task, max_steps=-1) # Display results @@ -111,17 +116,5 @@ async def main(): if __name__ == "__main__": - # Check for API configuration - if not os.getenv("OPENAI_API_KEY") and not os.getenv("OPENAI_BASE_URL"): - print("āš ļø Please configure OpenAI API access:") - print(" For OpenAI API: export OPENAI_API_KEY='your-api-key'") - print(" For local/custom endpoints: export OPENAI_BASE_URL='your-custom-endpoint'") - exit(1) - - # Display configuration - if os.getenv("OPENAI_BASE_URL"): - print(f"šŸ”— Using endpoint: {os.getenv('OPENAI_BASE_URL')}") - else: - print("šŸ”— Using default OpenAI API endpoint") - + asyncio.run(main()) \ No newline at end of file diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py new file mode 100644 index 000000000..ff41ac8e6 --- /dev/null +++ b/examples/openai_browser_2048.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +OpenAI Chat Agent playing Browser 2048 + +This example demonstrates using the OpenAIChatAgent with the browser-based 2048 game. +It shows how to: +- Initialize an OpenAI client with browser automation capabilities +- Configure the browser-2048 environment with Docker +- Use computer vision and interaction tools to play the game + +Requirements: +- pip install openai +- export OPENAI_API_KEY="your-api-key" # Or set OPENAI_BASE_URL for custom endpoints +- Docker installed and running + +Environment Variables: +- OPENAI_BASE_URL: Custom OpenAI-compatible API endpoint (optional) +- OPENAI_API_KEY: API key for authentication +""" + +import asyncio +import os +from openai import AsyncOpenAI +import hud +from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.clients import MCPClient +from hud.datasets import Task + + +async def main(): + # Initialize OpenAI client with environment variables + base_url = os.getenv("OPENAI_BASE_URL") + api_key = os.getenv("OPENAI_API_KEY") + + openai_client = AsyncOpenAI( + base_url=base_url if base_url else None, + api_key=api_key, + ) + + # Configure the browser-2048 environment + mcp_config = { + "local": { + "command": "docker", + "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"] + } + } + + system_prompt = """You are an expert 2048 game player using a browser interface. Your goal is to reach the tile specified by the user. + +HOW 2048 WORKS: +- 4x4 grid with numbered tiles (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048...) +- When you move, all tiles slide in that direction +- When two tiles with SAME number touch, they merge into one (2+2=4, 4+4=8, etc.) +- After each move, a new tile (2 or 4) appears randomly +- Game ends when grid is full and no merges possible + +BROWSER INTERACTION USING THE COMPUTER TOOL: +1. TAKE SCREENSHOTS: + Use: computer(action="screenshot") + This captures the current game state + +2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press": + - Move UP: computer(action="press", keys=["up"]) + - Move DOWN: computer(action="press", keys=["down"]) + - Move LEFT: computer(action="press", keys=["left"]) + - Move RIGHT: computer(action="press", keys=["right"]) + +CRITICAL RULES: +- Take a screenshot first to see the board state at the start of the game +- Make exactly ONE move per turn using the press action with arrow keys +- Continue playing until you reach the target or the game ends + +Strategy tips: +- Keep your highest tiles in a corner +- Build tiles in descending order from the corner +- Avoid random moves - be strategic +- Try to keep the board organized""" + + # Define the task with browser game setup and evaluation + task = Task( + prompt="""Play the browser-based 2048 game and try to reach the 128 tile. + + Take screenshots to see the game board, then make strategic moves using the browser interface. + You can use arrow keys or mouse gestures to move tiles.""", + mcp_config=mcp_config, + setup_tool={ + "name": "launch_app", + "arguments": {"app_name": "2048"} + }, # type: ignore + evaluate_tool={ + "name": "evaluate", + "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}}, + }, # type: ignore + ) + + # Initialize MCP client + client = MCPClient(mcp_config=task.mcp_config) + + model_name = "z-ai/glm-4.5v" + + # Create OpenAI agent with browser automation tools + agent = GenericOpenAIChatAgent( + mcp_client=client, + openai_client=openai_client, + model_name=model_name, + allowed_tools=["computer"], # Computer tool for browser automation + parallel_tool_calls=False, + system_prompt=system_prompt, + ) + + agent.metadata = {} + + # Run the game with tracing + with hud.trace("OpenAI Browser 2048 Game"): + try: + print("šŸŽ® Starting browser-based 2048 game with OpenAI agent...") + print(f"šŸ¤– Model: {agent.model_name}") + print(f"🌐 Browser environment running on localhost:8080") + print("="*50) + + result = await agent.run(task, max_steps=10) + + # Display results + print("="*50) + print(f"āœ… Game completed!") + print(f"šŸ† Final Score/Max Tile: {result.reward}") + if result.info: + print(f"šŸ“Š Game Stats: {result.info}") + + except Exception as e: + print(f"āŒ Error during game: {e}") + finally: + await client.shutdown() + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 2f974ef530b46bece3f82081348c2aa7b3a63336 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Wed, 3 Sep 2025 22:55:58 -0700 Subject: [PATCH 4/7] ruff --- examples/openai_2048.py | 30 ++++++++++++++++-------------- examples/openai_browser_2048.py | 25 +++++++++++-------------- hud/agents/openai_chat_generic.py | 10 +++++----- hud/cli/eval.py | 6 ++---- hud/cli/init.py | 12 ++++++------ hud/clients/fastmcp.py | 9 +++++++-- hud/clients/mcp_use.py | 4 +++- hud/otel/instrumentation.py | 3 ++- 8 files changed, 52 insertions(+), 47 deletions(-) diff --git a/examples/openai_2048.py b/examples/openai_2048.py index 387b7dd5e..000248b06 100644 --- a/examples/openai_2048.py +++ b/examples/openai_2048.py @@ -30,16 +30,16 @@ async def main(): # Initialize OpenAI client with environment variables base_url = os.getenv("OPENAI_BASE_URL") api_key = os.getenv("OPENAI_API_KEY") - + openai_client = AsyncOpenAI( base_url=base_url if base_url else None, # None will use default OpenAI endpoint api_key=api_key, ) - + mcp_config = { "local": { "command": "docker", - "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"] + "args": ["run", "--rm", "-i", "hudevals/hud-text-2048:latest"], } } @@ -67,14 +67,17 @@ async def main(): task = Task( prompt="""Aim for the 128 tile (atleast 800 points!)""", mcp_config=mcp_config, - setup_tool={"name": "setup","arguments": {"name": "board", "arguments": {"board_size": 4}},}, # type: ignore - evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore + setup_tool={ + "name": "setup", + "arguments": {"name": "board", "arguments": {"board_size": 4}}, + }, # type: ignore + evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore ) # Initialize MCP client client = MCPClient(mcp_config=task.mcp_config) - model_name = "gpt-5-mini" # Replace with your model name + model_name = "gpt-5-mini" # Replace with your model name # Create OpenAI agent with the text-2048 game tools agent = GenericOpenAIChatAgent( @@ -92,12 +95,12 @@ async def main(): try: print("šŸŽ® Starting 2048 game with OpenAI agent...") print(f"šŸ¤– Model: {agent.model_name}") - print("="*50) - + print("=" * 50) + result = await agent.run(task, max_steps=-1) - + # Display results - print("="*50) + print("=" * 50) print(f"āœ… Game completed!") print(f"šŸ† Final Score/Max Tile: {result.reward}") if result.info: @@ -106,8 +109,8 @@ async def main(): # Display conversation history print("šŸ—£ļø Conversation History:") for i, msg in enumerate(agent.conversation_history): - print(f" {i+1} : {msg}") - print("-"*30) + print(f" {i + 1} : {msg}") + print("-" * 30) except Exception as e: print(f"āŒ Error during game: {e}") @@ -116,5 +119,4 @@ async def main(): if __name__ == "__main__": - - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py index ff41ac8e6..c25a5f0e1 100644 --- a/examples/openai_browser_2048.py +++ b/examples/openai_browser_2048.py @@ -31,17 +31,17 @@ async def main(): # Initialize OpenAI client with environment variables base_url = os.getenv("OPENAI_BASE_URL") api_key = os.getenv("OPENAI_API_KEY") - + openai_client = AsyncOpenAI( base_url=base_url if base_url else None, api_key=api_key, ) - + # Configure the browser-2048 environment mcp_config = { "local": { "command": "docker", - "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"] + "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"], } } @@ -83,10 +83,7 @@ async def main(): Take screenshots to see the game board, then make strategic moves using the browser interface. You can use arrow keys or mouse gestures to move tiles.""", mcp_config=mcp_config, - setup_tool={ - "name": "launch_app", - "arguments": {"app_name": "2048"} - }, # type: ignore + setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}}, # type: ignore evaluate_tool={ "name": "evaluate", "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}}, @@ -96,14 +93,14 @@ async def main(): # Initialize MCP client client = MCPClient(mcp_config=task.mcp_config) - model_name = "z-ai/glm-4.5v" + model_name = "gpt-5-mini" # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc... # Create OpenAI agent with browser automation tools agent = GenericOpenAIChatAgent( mcp_client=client, openai_client=openai_client, model_name=model_name, - allowed_tools=["computer"], # Computer tool for browser automation + allowed_tools=["computer"], parallel_tool_calls=False, system_prompt=system_prompt, ) @@ -116,12 +113,12 @@ async def main(): print("šŸŽ® Starting browser-based 2048 game with OpenAI agent...") print(f"šŸ¤– Model: {agent.model_name}") print(f"🌐 Browser environment running on localhost:8080") - print("="*50) - + print("=" * 50) + result = await agent.run(task, max_steps=10) - + # Display results - print("="*50) + print("=" * 50) print(f"āœ… Game completed!") print(f"šŸ† Final Score/Max Tile: {result.reward}") if result.info: @@ -134,4 +131,4 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index a370ba00d..3562d97e7 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -103,7 +103,7 @@ def get_tool_schemas(self) -> list[dict]: ) async def get_response(self, messages: list[Any]) -> AgentResponse: """Send chat request to OpenAI and convert the response.""" - + # Convert MCP tool schemas to OpenAI format mcp_schemas = self.get_tool_schemas() @@ -117,15 +117,15 @@ async def get_response(self, messages: list[Any]) -> AgentResponse: choice = response.choices[0] msg = choice.message - + assistant_msg: dict[str, Any] = {"role": "assistant"} - + if msg.content: assistant_msg["content"] = msg.content - + if msg.tool_calls: assistant_msg["tool_calls"] = msg.tool_calls - + messages.append(assistant_msg) # Store the complete conversation history diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 92626cbc7..c090cf4a8 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -111,8 +111,7 @@ async def run_single_task( ) raise typer.Exit(1) from e - agent_config: dict[str, Any] = { - } + agent_config: dict[str, Any] = {} if allowed_tools: agent_config["allowed_tools"] = allowed_tools @@ -240,8 +239,7 @@ async def run_full_dataset( ) raise typer.Exit(1) from e - agent_config: dict[str, Any] = { - } + agent_config: dict[str, Any] = {} if allowed_tools: agent_config["allowed_tools"] = allowed_tools diff --git a/hud/cli/init.py b/hud/cli/init.py index c7404e088..a8de9464f 100644 --- a/hud/cli/init.py +++ b/hud/cli/init.py @@ -139,7 +139,7 @@ async def evaluate(target: int = 10) -> EvaluationResult: mcp.run() ''' -TASKS_JSON_TEMPLATE = '''[ +TASKS_JSON_TEMPLATE = """[ {{ "prompt": "Increment the counter to reach 10", "mcp_config": {{ @@ -159,7 +159,7 @@ async def evaluate(target: int = 10) -> EvaluationResult: }} }} ] -''' +""" TEST_TASK_TEMPLATE = '''#!/usr/bin/env python """Simple example of running tasks from tasks.json. @@ -210,7 +210,7 @@ async def main(): asyncio.run(main()) ''' -NOTEBOOK_TEMPLATE = '''{{ +NOTEBOOK_TEMPLATE = """{{ "cells": [ {{ "cell_type": "markdown", @@ -427,9 +427,9 @@ async def main(): "nbformat": 4, "nbformat_minor": 4 }} -''' +""" -README_TEMPLATE = '''# {title} +README_TEMPLATE = """# {title} A minimal HUD environment demonstrating the Task pattern with a simple counter. @@ -510,7 +510,7 @@ async def main(): **Note**: Only public HuggingFace datasets appear as leaderboards! šŸ“š Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) -''' +""" def sanitize_name(name: str) -> str: diff --git a/hud/clients/fastmcp.py b/hud/clients/fastmcp.py index 56b8bb223..c9f8b992d 100644 --- a/hud/clients/fastmcp.py +++ b/hud/clients/fastmcp.py @@ -106,8 +106,13 @@ async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None: # Configure validation for output schemas based on client setting try: - if hasattr(self._client, "_session_state") and self._client._session_state.session is not None: # noqa: E501 - self._client._session_state.session._validate_structured_outputs = self._strict_validation # noqa: E501 + if ( + hasattr(self._client, "_session_state") + and self._client._session_state.session is not None + ): # noqa: E501 + self._client._session_state.session._validate_structured_outputs = ( + self._strict_validation + ) # noqa: E501 except ImportError: pass diff --git a/hud/clients/mcp_use.py b/hud/clients/mcp_use.py index b81b714e2..2f9e94ce0 100644 --- a/hud/clients/mcp_use.py +++ b/hud/clients/mcp_use.py @@ -79,7 +79,9 @@ async def _connect(self, mcp_config: dict[str, dict[str, Any]]) -> None: and hasattr(session.connector, "client_session") and session.connector.client_session is not None ): - session.connector.client_session._validate_structured_outputs = self._strict_validation # noqa: E501 + session.connector.client_session._validate_structured_outputs = ( + self._strict_validation + ) # noqa: E501 except ImportError: # ValidationOptions may not be available in some mcp versions pass diff --git a/hud/otel/instrumentation.py b/hud/otel/instrumentation.py index c7c72b10d..ad30f5d37 100644 --- a/hud/otel/instrumentation.py +++ b/hud/otel/instrumentation.py @@ -32,8 +32,9 @@ def install_mcp_instrumentation(provider: TracerProvider) -> None: try: # First, patch the _instruments to use our fork import opentelemetry.instrumentation.mcp.instrumentation as mcp_inst + mcp_inst._instruments = ("hud-mcp-python-sdk >= 3.13.1",) - + from opentelemetry.instrumentation.mcp.instrumentation import ( McpInstrumentor, ) From 5e4d74534bd23cdce8c49e2c6e85ceae7688052c Mon Sep 17 00:00:00 2001 From: Jaideep Date: Thu, 4 Sep 2025 17:35:56 -0700 Subject: [PATCH 5/7] fix: image handling in OpenAI chat agent --- examples/openai_2048.py | 6 +- examples/openai_browser_2048.py | 23 +++-- hud/agents/openai_chat_generic.py | 140 ++++++++++++++++++++++++------ 3 files changed, 132 insertions(+), 37 deletions(-) diff --git a/examples/openai_2048.py b/examples/openai_2048.py index 000248b06..37b1cc863 100644 --- a/examples/openai_2048.py +++ b/examples/openai_2048.py @@ -65,13 +65,13 @@ async def main(): # Define the task with game setup and evaluation task = Task( - prompt="""Aim for the 128 tile (atleast 800 points!)""", + prompt="""Aim for the 128 tile (atleast a score of 800!)""", mcp_config=mcp_config, setup_tool={ "name": "setup", "arguments": {"name": "board", "arguments": {"board_size": 4}}, }, # type: ignore - evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {}}}, # type: ignore + evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 128}}}, # type: ignore ) # Initialize MCP client @@ -97,7 +97,7 @@ async def main(): print(f"šŸ¤– Model: {agent.model_name}") print("=" * 50) - result = await agent.run(task, max_steps=-1) + result = await agent.run(task, max_steps=100) # Display results print("=" * 50) diff --git a/examples/openai_browser_2048.py b/examples/openai_browser_2048.py index c25a5f0e1..e9a3723c9 100644 --- a/examples/openai_browser_2048.py +++ b/examples/openai_browser_2048.py @@ -55,9 +55,10 @@ async def main(): - Game ends when grid is full and no merges possible BROWSER INTERACTION USING THE COMPUTER TOOL: -1. TAKE SCREENSHOTS: +1. FIRST TURN ONLY - TAKE SCREENSHOT: Use: computer(action="screenshot") - This captures the current game state + This captures the initial game state. Only needed for your first turn. + After that, the environment will automatically return an image with each successful move. 2. MAKE MOVES - Use arrow keys by calling the computer tool with action="press": - Move UP: computer(action="press", keys=["up"]) @@ -66,9 +67,8 @@ async def main(): - Move RIGHT: computer(action="press", keys=["right"]) CRITICAL RULES: -- Take a screenshot first to see the board state at the start of the game - Make exactly ONE move per turn using the press action with arrow keys -- Continue playing until you reach the target or the game ends +- Continue playing until you reach the target or the game ends, no need to ask the user for confirmation. Strategy tips: - Keep your highest tiles in a corner @@ -79,9 +79,9 @@ async def main(): # Define the task with browser game setup and evaluation task = Task( prompt="""Play the browser-based 2048 game and try to reach the 128 tile. - - Take screenshots to see the game board, then make strategic moves using the browser interface. - You can use arrow keys or mouse gestures to move tiles.""", + + Start by taking a screenshot to see the initial game board, then make strategic moves using arrow keys. + After your first screenshot, the game board will be automatically shown after each successful move.""", mcp_config=mcp_config, setup_tool={"name": "launch_app", "arguments": {"app_name": "2048"}}, # type: ignore evaluate_tool={ @@ -93,7 +93,7 @@ async def main(): # Initialize MCP client client = MCPClient(mcp_config=task.mcp_config) - model_name = "gpt-5-mini" # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc... + model_name = "z-ai/glm-4.5v" # "z-ai/glm-4.5v", "Qwen/Qwen2.5-VL-7B-Instruct" etc... # Create OpenAI agent with browser automation tools agent = GenericOpenAIChatAgent( @@ -115,7 +115,7 @@ async def main(): print(f"🌐 Browser environment running on localhost:8080") print("=" * 50) - result = await agent.run(task, max_steps=10) + result = await agent.run(task, max_steps=100) # Display results print("=" * 50) @@ -124,6 +124,11 @@ async def main(): if result.info: print(f"šŸ“Š Game Stats: {result.info}") + print("\nšŸ“ Full interaction trace:") + for i, msg in enumerate(agent.conversation_history): + print(f" {i + 1} : {msg}") + print("-" * 30) + except Exception as e: print(f"āŒ Error during game: {e}") finally: diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index 3562d97e7..f4a1aec5e 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -70,27 +70,84 @@ async def get_system_messages(self) -> list[Any]: async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]: """Format blocks for OpenAI.""" - return [ - { - "role": "user", - "content": [ - {"type": "text", "text": block.text} - for block in blocks - if isinstance(block, types.TextContent) - ], - }, - ] + content = [] + for block in blocks: + if isinstance(block, types.TextContent): + content.append({"type": "text", "text": block.text}) + elif isinstance(block, types.ImageContent): + content.append({ + "type": "image_url", + "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"} + }) + + return [{"role": "user", "content": content}] + + def _sanitize_schema_for_openai(self, schema: dict) -> dict: + """Convert MCP JSON Schema to OpenAI-compatible format. + + Handles unsupported features like anyOf and prefixItems. + """ + if not isinstance(schema, dict): + return schema + + sanitized = {} + + for key, value in schema.items(): + if key == "anyOf" and isinstance(value, list): + # Handle anyOf patterns (usually for nullable fields) + non_null_types = [v for v in value if not (isinstance(v, dict) and v.get("type") == "null")] + if non_null_types: + # Use the first non-null type + sanitized.update(self._sanitize_schema_for_openai(non_null_types[0])) + else: + sanitized["type"] = "string" # Fallback + + elif key == "prefixItems": + # Convert prefixItems to simple items + sanitized["type"] = "array" + if isinstance(value, list) and value: + # Use the type from the first item as the items schema + first_item = value[0] + if isinstance(first_item, dict): + sanitized["items"] = {"type": first_item.get("type", "string")} + else: + sanitized["items"] = {"type": "string"} + + elif key == "properties" and isinstance(value, dict): + # Recursively sanitize property schemas + sanitized[key] = { + prop_name: self._sanitize_schema_for_openai(prop_schema) + for prop_name, prop_schema in value.items() + } + + elif key == "items" and isinstance(value, dict): + # Recursively sanitize items schema + sanitized[key] = self._sanitize_schema_for_openai(value) + + elif key in ("type", "description", "enum", "required", "default", + "minimum", "maximum", "minItems", "maxItems"): + # These are supported by OpenAI + sanitized[key] = value + + return sanitized or {"type": "object"} def get_tool_schemas(self) -> list[dict]: tool_schemas = super().get_tool_schemas() openai_tools = [] for schema in tool_schemas: + parameters = schema.get("parameters", {}) + + if parameters: + sanitized_params = self._sanitize_schema_for_openai(parameters) + else: + sanitized_params = {"type": "object", "properties": {}} + openai_tool = { "type": "function", "function": { "name": schema["name"], "description": schema.get("description", ""), - "parameters": schema.get("parameters", {"type": "object", "properties": {}}), + "parameters": sanitized_params, }, } openai_tools.append(openai_tool) @@ -151,22 +208,55 @@ async def format_tool_results( tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult], ) -> list[Any]: - """Render MCP tool results as OpenAI ``role=tool`` messages.""" + """Render MCP tool results as OpenAI messages. + + Note: OpenAI tool messages only support string content. + When images are present, we return both a tool message and a user message. + """ rendered: list[dict[str, Any]] = [] for call, res in zip(tool_calls, tool_results, strict=False): - if res.structuredContent: - content = json.dumps(res.structuredContent) - else: - # Concatenate any TextContent blocks - content = "".join( - c.text # type: ignore[attr-defined] - for c in res.content - if hasattr(c, "text") - ) - tool_msg = { + # Use structuredContent.result if available, otherwise use content + items = res.content + if res.structuredContent and isinstance(res.structuredContent, dict): + items = res.structuredContent.get("result", res.content) + + # Separate text and image content + text_parts = [] + image_parts = [] + + for item in items: + if isinstance(item, dict): + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + elif item.get("type") == "image": + image_parts.append({ + "type": "image_url", + "image_url": { + "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}" + } + }) + elif isinstance(item, types.TextContent): + text_parts.append(item.text) + elif isinstance(item, types.ImageContent): + image_parts.append({ + "type": "image_url", + "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"} + }) + + text_content = "".join(text_parts) if text_parts else "Tool executed successfully" + rendered.append({ "role": "tool", "tool_call_id": call.id, - "content": content or "", # Ensure content is never None - } - rendered.append(tool_msg) + "content": text_content, + }) + + # If there are images, add them as a separate user message + if image_parts: + # Add a user message with the images + content_with_images = [{"type": "text", "text": "Tool returned the following:"}] + image_parts + rendered.append({ + "role": "user", + "content": content_with_images, + }) + return rendered From ceb4f8f1277071c84f933d8cea6057daa72aa481 Mon Sep 17 00:00:00 2001 From: Jaideep Date: Thu, 4 Sep 2025 17:40:25 -0700 Subject: [PATCH 6/7] ruff --- examples/openai_2048.py | 5 +- hud/agents/openai_chat_generic.py | 109 ++++++++++++++++++------------ 2 files changed, 70 insertions(+), 44 deletions(-) diff --git a/examples/openai_2048.py b/examples/openai_2048.py index 37b1cc863..f83aab590 100644 --- a/examples/openai_2048.py +++ b/examples/openai_2048.py @@ -71,7 +71,10 @@ async def main(): "name": "setup", "arguments": {"name": "board", "arguments": {"board_size": 4}}, }, # type: ignore - evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 128}}}, # type: ignore + evaluate_tool={ + "name": "evaluate", + "arguments": {"name": "max_number", "arguments": {"target": 128}}, + }, # type: ignore ) # Initialize MCP client diff --git a/hud/agents/openai_chat_generic.py b/hud/agents/openai_chat_generic.py index f4a1aec5e..8e61be940 100644 --- a/hud/agents/openai_chat_generic.py +++ b/hud/agents/openai_chat_generic.py @@ -75,33 +75,37 @@ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]: if isinstance(block, types.TextContent): content.append({"type": "text", "text": block.text}) elif isinstance(block, types.ImageContent): - content.append({ - "type": "image_url", - "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"} - }) - + content.append( + { + "type": "image_url", + "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"}, + } + ) + return [{"role": "user", "content": content}] def _sanitize_schema_for_openai(self, schema: dict) -> dict: """Convert MCP JSON Schema to OpenAI-compatible format. - + Handles unsupported features like anyOf and prefixItems. """ if not isinstance(schema, dict): return schema - + sanitized = {} - + for key, value in schema.items(): if key == "anyOf" and isinstance(value, list): # Handle anyOf patterns (usually for nullable fields) - non_null_types = [v for v in value if not (isinstance(v, dict) and v.get("type") == "null")] + non_null_types = [ + v for v in value if not (isinstance(v, dict) and v.get("type") == "null") + ] if non_null_types: # Use the first non-null type sanitized.update(self._sanitize_schema_for_openai(non_null_types[0])) else: sanitized["type"] = "string" # Fallback - + elif key == "prefixItems": # Convert prefixItems to simple items sanitized["type"] = "array" @@ -112,23 +116,32 @@ def _sanitize_schema_for_openai(self, schema: dict) -> dict: sanitized["items"] = {"type": first_item.get("type", "string")} else: sanitized["items"] = {"type": "string"} - + elif key == "properties" and isinstance(value, dict): # Recursively sanitize property schemas sanitized[key] = { prop_name: self._sanitize_schema_for_openai(prop_schema) for prop_name, prop_schema in value.items() } - + elif key == "items" and isinstance(value, dict): # Recursively sanitize items schema sanitized[key] = self._sanitize_schema_for_openai(value) - - elif key in ("type", "description", "enum", "required", "default", - "minimum", "maximum", "minItems", "maxItems"): + + elif key in ( + "type", + "description", + "enum", + "required", + "default", + "minimum", + "maximum", + "minItems", + "maxItems", + ): # These are supported by OpenAI sanitized[key] = value - + return sanitized or {"type": "object"} def get_tool_schemas(self) -> list[dict]: @@ -136,12 +149,12 @@ def get_tool_schemas(self) -> list[dict]: openai_tools = [] for schema in tool_schemas: parameters = schema.get("parameters", {}) - + if parameters: sanitized_params = self._sanitize_schema_for_openai(parameters) else: sanitized_params = {"type": "object", "properties": {}} - + openai_tool = { "type": "function", "function": { @@ -209,7 +222,7 @@ async def format_tool_results( tool_results: list[MCPToolResult], ) -> list[Any]: """Render MCP tool results as OpenAI messages. - + Note: OpenAI tool messages only support string content. When images are present, we return both a tool message and a user message. """ @@ -219,44 +232,54 @@ async def format_tool_results( items = res.content if res.structuredContent and isinstance(res.structuredContent, dict): items = res.structuredContent.get("result", res.content) - + # Separate text and image content text_parts = [] image_parts = [] - + for item in items: if isinstance(item, dict): if item.get("type") == "text": text_parts.append(item.get("text", "")) elif item.get("type") == "image": - image_parts.append({ - "type": "image_url", - "image_url": { - "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}" + image_parts.append( + { + "type": "image_url", + "image_url": { + "url": f"data:{item.get('mimeType', 'image/png')};base64,{item.get('data', '')}" + }, } - }) + ) elif isinstance(item, types.TextContent): text_parts.append(item.text) elif isinstance(item, types.ImageContent): - image_parts.append({ - "type": "image_url", - "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"} - }) - + image_parts.append( + { + "type": "image_url", + "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"}, + } + ) + text_content = "".join(text_parts) if text_parts else "Tool executed successfully" - rendered.append({ - "role": "tool", - "tool_call_id": call.id, - "content": text_content, - }) - + rendered.append( + { + "role": "tool", + "tool_call_id": call.id, + "content": text_content, + } + ) + # If there are images, add them as a separate user message if image_parts: # Add a user message with the images - content_with_images = [{"type": "text", "text": "Tool returned the following:"}] + image_parts - rendered.append({ - "role": "user", - "content": content_with_images, - }) - + content_with_images = [ + {"type": "text", "text": "Tool returned the following:"} + ] + image_parts + rendered.append( + { + "role": "user", + "content": content_with_images, + } + ) + return rendered From 585d0dbc359a37661a8a7118c344a8a199ca7b05 Mon Sep 17 00:00:00 2001 From: "Parth A. Patel" Date: Thu, 4 Sep 2025 17:55:22 -0700 Subject: [PATCH 7/7] nit: fix ruff checks --- hud/datasets/execution/parallel.py | 10 +++++----- hud/otel/exporters.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hud/datasets/execution/parallel.py b/hud/datasets/execution/parallel.py index dfa385219..45311b915 100644 --- a/hud/datasets/execution/parallel.py +++ b/hud/datasets/execution/parallel.py @@ -75,8 +75,8 @@ def _process_worker( pass # Set up signal handler for clean interruption - def signal_handler(signum, frame): - logger.warning(f"Worker {worker_id}: Received interrupt signal") + def signal_handler(signum: int, frame: Any) -> None: + logger.warning("Worker %s: Received interrupt signal", worker_id) # Raise KeyboardInterrupt to actually interrupt the worker raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user") @@ -171,7 +171,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in results = await asyncio.gather(*tasks, return_exceptions=False) return results except asyncio.CancelledError: - logger.info(f"Worker {worker_id}: Tasks cancelled due to interruption") + logger.info("Worker %s: Tasks cancelled due to interruption", worker_id) # Return error results for all tasks return [ ( @@ -208,7 +208,7 @@ async def process_single_task(index: int, task_dict: dict[str, Any]) -> tuple[in return results except KeyboardInterrupt: - logger.info(f"Worker {worker_id}: Interrupted by user, stopping gracefully") + logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id) # Return partial results for tasks that completed partial_results = [] for idx, _ in task_batch: @@ -489,7 +489,7 @@ async def run_dataset_parallel_manual( "content": "Task interrupted (Ctrl+C)", } - logger.info(f"Interrupted after {completed}/{total} tasks") + logger.info("Interrupted after %s/%s tasks", completed, total) raise # Re-raise to propagate the interrupt finally: diff --git a/hud/otel/exporters.py b/hud/otel/exporters.py index 4d5dd9a1e..82e2ac954 100644 --- a/hud/otel/exporters.py +++ b/hud/otel/exporters.py @@ -14,9 +14,9 @@ import contextlib import json import logging +import time from collections import defaultdict from datetime import UTC, datetime -import time from typing import TYPE_CHECKING, Any from mcp.types import ClientRequest, ServerResult