Skip to content

Commit c16e48e

Browse files
jdchawla29, Parth220, and cursoragent
authored
Grounding (#113)
- Introduced GroundedOpenAIChatAgent for separating visual grounding from reasoning.
- Added GroundedComputerTool for resolving element descriptions to coordinates.
- Implemented Grounder class for API calls to grounding models and coordinate parsing.
- Created configuration and initialization for grounding models.
- Updated Docker image version in browser agent example.
- Added new grounding examples and configurations in the tools module.
- Added tests for GroundedOpenAIChatAgent and GroundedComputerTool.

Co-authored-by: Parth A. Patel <parthpatel0220@gmail.com>
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
1 parent a875101 commit c16e48e

23 files changed

+1883
-17
lines changed

examples/03_browser_agent_loop.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ async def main():
3939
mcp_config = {
4040
"local": {
4141
"command": "docker",
42-
"args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.2"],
42+
"args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"],
4343
}
4444
}
4545

examples/grounded_agent.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
Grounded Agent Example
3+
4+
This example demonstrates the GroundedOpenAIChatAgent that separates:
5+
- Visual grounding (element detection) using a specialized vision model
6+
- High-level reasoning using GPT-4o or similar
7+
8+
Prerequisites:
9+
1. Set your API keys:
10+
export OPENAI_API_KEY=your_openai_key
11+
export OPENROUTER_API_KEY=your_openrouter_key
12+
export HUD_API_KEY=your_hud_key
13+
"""
14+
15+
import asyncio
16+
import os
17+
18+
import hud
19+
from hud.agents.grounded_openai import GroundedOpenAIChatAgent
20+
from hud.settings import settings
21+
from hud.tools.grounding import GrounderConfig
22+
from openai import AsyncOpenAI
23+
24+
25+
async def main():
    """Run the grounded agent demo.

    Builds a GroundedOpenAIChatAgent that pairs a vision model (for
    resolving element descriptions to coordinates) with an OpenAI chat
    model (for high-level planning), then runs it against a Dockerized
    browser environment on a form-filling task.

    Requires OPENAI_API_KEY, OPENROUTER_API_KEY, and HUD_API_KEY to be
    set, plus a local Docker daemon for the browser environment.
    """
    with hud.trace("Grounded Agent Demo"):
        # Vision model used only for grounding (element -> coordinates).
        grounder_config = GrounderConfig(
            api_base="https://openrouter.ai/api/v1",  # OpenRouter API
            model="qwen/qwen-2.5-vl-7b-instruct",  # Vision model for grounding
            api_key=settings.openrouter_api_key,
        )

        # MCP configuration: launch the browser environment in Docker.
        mcp_config = {
            "local": {
                "command": "docker",
                "args": ["run", "--rm", "-i", "-p", "8080:8080", "hudevals/hud-browser:0.1.3"],
            }
        }

        # Client for the planning model; any OpenAI-compatible endpoint works.
        openai_client = AsyncOpenAI(
            api_key=os.getenv("OPENAI_API_KEY", settings.openai_api_key)
        )

        agent = GroundedOpenAIChatAgent(
            grounder_config=grounder_config,
            openai_client=openai_client,
            model_name="gpt-4o-mini",  # Planning model
        )
        agent.metadata = {}

        try:
            # Local import keeps the example's top-level imports minimal.
            from hud.datasets import Task

            form_url = "https://hb.cran.dev/forms/post"

            # Plain string: no placeholders, so the f-prefix was extraneous (F541).
            form_prompt = """
            Fill out the form:
            1. Enter "Grounded Test" in the customer name field
            2. Enter "555-9876" in the telephone field
            3. Type "Testing grounded agent with separated vision and reasoning" in comments
            4. Select medium pizza size
            5. Choose mushroom as a topping
            6. Submit the form
            """

            task = Task(
                prompt=form_prompt,
                mcp_config=mcp_config,
                setup_tool={
                    "name": "playwright",
                    "arguments": {"action": "navigate", "url": form_url},
                },
            )

            print("📋 Task: Form interaction")
            print("🚀 Running grounded agent...\n")

            result = await agent.run(task, max_steps=10)
            print(f"Result: {result.content}\n")

        except Exception as e:
            # Demo-level catch-all: report the failure instead of crashing.
            print(f"Error during agent execution: {e}")

        print("\n✨ Grounded agent demo complete!")
93+
def _run() -> None:
    """Entry point: drive the async demo to completion."""
    asyncio.run(main())


if __name__ == "__main__":
    _run()

0 commit comments

Comments
 (0)