hud-evals · lorenss-m · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/examples/agents_tools/simple_task_example.py b/examples/agents_tools/simple_task_example.py
@@ -18,7 +18,7 @@
 import asyncio
 import logging
 from hud.mcp import ClaudeMCPAgent
-from hud.task import TaskConfig
+from hud.datasets import TaskConfig
 from mcp.types import CallToolRequestParams as MCPToolCall
 from mcp_use import MCPClient
 

diff --git a/examples/environments/gmail_local.ipynb b/examples/environments/gmail_local.ipynb
diff --git a/examples/environments/gmail_local.py b/examples/environments/gmail_local.py
@@ -1,25 +1,25 @@
 import asyncio
 import hud
-from hud.mcp import ClaudeMCPAgent, OpenAIMCPAgent
-from hud.task import TaskConfig
-from mcp_use import MCPClient
+from hud.datasets import TaskConfig
+from hud.mcp import ClaudeMCPAgent, OpenAIMCPAgent, MCPClient
+from hud.mcp.base import AgentResult
+
+import logging
+
+logging.basicConfig(level=logging.INFO)
 
 
 async def main():
-    with hud.trace("gmail_local"):
+    with hud.trace("Yutori Gmail Task Trace"):
         task_dict = {
-            "prompt": "Open Sent mail, search for the Series B pitch deck, forward it to [email protected], and mark the original message as important.",
+            "prompt": "Open Sent mail, find the Series B pitch deck email, forward it to [email protected], and mark the original message as important.",
             "mcp_config": {
                 "gmail": {
-                    "command": "docker",
+                    "command": "/bin/sh",
                     "args": [
-                        "run",
-                        "-i",  # interactive mode for stdio MCP server
-                        "--rm",  # remove the container after it exits
-                        "-p",
-                        "6080:6080",  # map port 6080 to the host for noVNC
-                        "hudpython/gmail-clone:latest",  # use hud gmail image
-                    ],
+                        "-c",
+                        "docker run -i --rm -p 6080:6080 gmail 2>/dev/null"
+                    ]
                 }
             },
             "setup_tool": {
@@ -37,36 +37,44 @@ async def main():
             "metadata": {"id": "forward-series-b-deck-to-billgates"},
         }
 
+        # Append a disclaimer stating that this is a sandboxed test environment (Anthropic models can be reluctant to perform some actions otherwise).
+        # Feel free to modify or remove this disclaimer.
+        task_dict["prompt"] = (
+            task_dict["prompt"]
+            + " This is a mock sandbox environment for evaluations, so feel free to execute any actions you want and do not ask me for confirmation before executing actions."
+        )
+
         # Create TaskConfig from dict
         task = TaskConfig(**task_dict)
 
         print("📡 Defining the environment...")
-        print("🔴 See the agent live at http://localhost:6080/vnc.html")
-        client = MCPClient.from_dict({"mcp_config": task.mcp_config})
+        client = MCPClient(mcp_config=task.mcp_config)
+
 
         agent = ClaudeMCPAgent(  # or OpenAIMCPAgent
             mcp_client=client,
-            model="claude-3-7-sonnet-20250219",
-            allowed_tools=["computer"],
+            model="claude-sonnet-4-20250514",
+            # Allow the anthropic_computer tool because this example uses ClaudeMCPAgent
+            allowed_tools=["anthropic_computer"], # Check our hud/tools/computer/anthropic.py
             initial_screenshot=True,
         )
 
         print(f"📋 Task: {task.prompt}")
         print(f"⚙️  Setup: {task.setup_tool}")
-        print(f"📊 Evaluate: {task.evaluate_tool}")
-
+        print(f"📊 Evaluate: {task.evaluate_tool}")        
         # Run the task
         print("🚀 Running the task...")
-        eval_result = await agent.run(task, max_steps=10)
-        print(f"🎉 Task Result: {eval_result}")
+        print("🔴 See the agent live at http://localhost:6080/vnc.html")
+        eval_result: AgentResult = await agent.run(task, max_steps=30)
 
         # Show formatted results
-        reward = eval_result.get("reward", 0.0)
-        print(f"   🏆 Reward: {reward}")
+        print(f"🎉 Task Result:")
+        print(f"   🏆 Reward: {eval_result.reward}")
+        print(f"   🔍 Content: {eval_result.content[:1000] if eval_result.content else 'No content'}...")
 
         # Clean up
         print("\n🧹 Cleaning up...")
-        await client.close_all_sessions()
+        await client.close()
         print("✅ Done!")