Skip to content

Commit 8855310

Browse files
committed
Update OmniMCP for independent operation
This commit makes OmniMCP more independent of OpenAdapt: (1) create a local config.py to replace the openadapt.config dependency; (2) use the Anthropic SDK directly instead of openadapt.drivers.anthropic; (3) update the Claude model to use the latest versions (3.5/3.7); (4) replace run_omnimcp.py with a local implementation; (5) update imports throughout the codebase to use local modules.
1 parent c435c4a commit 8855310

File tree

5 files changed

+284
-27
lines changed

5 files changed

+284
-27
lines changed

omnimcp/omnimcp/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
# Set up path to include OpenAdapt modules
44
from . import pathing
55

6-
# Import from OpenAdapt modules
7-
from openadapt.omnimcp import OmniMCP
8-
from openadapt.run_omnimcp import main
6+
# Import from local modules
7+
from .omnimcp import OmniMCP
98

109
__version__ = "0.1.0"

omnimcp/omnimcp/computer_use.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
# Import pathing first to ensure OpenAdapt is in the path
2727
from . import pathing
28-
from openadapt.config import config
28+
from omnimcp.config import config
2929

3030

3131
def ensure_docker_installed():

omnimcp/omnimcp/config.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Configuration for OmniMCP.
2+
3+
This module provides a simple configuration system for OmniMCP.
4+
Configuration values can be set via environment variables.
5+
"""
6+
7+
import os
8+
from typing import Any, Dict
9+
10+
11+
class Config:
    """Configuration for OmniMCP, populated from environment variables.

    Every setting is read once, when the instance is constructed. A variable
    that is not set in the environment falls back to the default shown next
    to it in ``__init__``.
    """

    def __init__(self):
        """Initialize configuration from environment variables.

        Raises:
            ValueError: If ``MCP_PORT`` is set to something that is not an
                integer.
        """
        # Anthropic API
        # NOTE(review): the fallback is a non-empty placeholder string, so a
        # plain truthiness check cannot detect a missing key -- confirm that
        # callers compare against the placeholder where that matters.
        self.ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "<ANTHROPIC_API_KEY>")
        self.CLAUDE_MODEL = os.getenv("CLAUDE_MODEL", "claude-3-5-sonnet-latest")
        self.CLAUDE_MODEL_ALTERNATIVES = [
            # The Claude 3.7 Sonnet snapshot is dated 2025-02-19; the
            # previous value ("...20250229") was not a valid model ID.
            "claude-3-7-sonnet-20250219",
            "claude-3-5-sonnet-latest",
        ]

        # OmniParser
        self.OMNIPARSER_URL = os.getenv("OMNIPARSER_URL", "http://localhost:8000")

        # AWS (for OmniParser deployment)
        self.AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "")
        self.AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "")
        self.AWS_REGION = os.getenv("AWS_REGION", "us-west-2")

        # Deployment
        self.PROJECT_NAME = os.getenv("PROJECT_NAME", "omnimcp")

        # MCP Server
        raw_port = os.getenv("MCP_PORT", "8765")
        try:
            self.MCP_PORT = int(raw_port)
        except ValueError as err:
            # Same exception type as before, but name the offending variable
            # so a bad environment is easy to diagnose.
            raise ValueError(
                f"MCP_PORT must be an integer, got {raw_port!r}"
            ) from err


# Create a singleton instance
config = Config()

omnimcp/omnimcp/omnimcp.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,11 @@
3434
from loguru import logger
3535
from pynput import keyboard, mouse
3636

37+
from anthropic import Anthropic
38+
3739
from openadapt import utils
3840
from omnimcp.adapters.omniparser import OmniParserProvider
39-
from openadapt.config import config
40-
from openadapt.drivers import anthropic
41+
from omnimcp.config import config
4142

4243

4344
class ScreenElement:
@@ -708,12 +709,16 @@ async def describe_screen_with_claude(self) -> str:
708709
Describe the overall screen, main elements, and possible interactions a user might perform.
709710
"""
710711

711-
# Get response from Claude
712-
response = anthropic.prompt(
713-
prompt=prompt,
714-
system_prompt=system_prompt,
715-
api_key=self.claude_api_key
716-
)
712+
# Get response from Claude using Anthropic SDK
713+
client = Anthropic(api_key=self.claude_api_key)
714+
response = client.messages.create(
715+
model=config.CLAUDE_MODEL,
716+
max_tokens=1000,
717+
system=system_prompt,
718+
messages=[
719+
{"role": "user", "content": prompt}
720+
]
721+
).content[0].text
717722

718723
return response
719724

@@ -768,12 +773,16 @@ async def describe_element_with_claude(self, element: ScreenElement) -> str:
768773
Describe what this element is, what it does, and how a user might interact with it.
769774
"""
770775

771-
# Get response from Claude
772-
response = anthropic.prompt(
773-
prompt=prompt,
774-
system_prompt=system_prompt,
775-
api_key=self.claude_api_key
776-
)
776+
# Get response from Claude using Anthropic SDK
777+
client = Anthropic(api_key=self.claude_api_key)
778+
response = client.messages.create(
779+
model=config.CLAUDE_MODEL,
780+
max_tokens=1000,
781+
system=system_prompt,
782+
messages=[
783+
{"role": "user", "content": prompt}
784+
]
785+
).content[0].text
777786

778787
return response
779788

@@ -811,12 +820,16 @@ def prompt_claude(self, prompt: str, system_prompt: Optional[str] = None) -> str
811820
You have access to a structured description of the current screen through the Model Context Protocol.
812821
Analyze the UI elements and provide clear, concise guidance based on the current screen state."""
813822

814-
# Get response from Claude
815-
response = anthropic.prompt(
816-
prompt=full_prompt,
817-
system_prompt=system_prompt,
818-
api_key=self.claude_api_key
819-
)
823+
# Get response from Claude using Anthropic SDK
824+
client = Anthropic(api_key=self.claude_api_key)
825+
response = client.messages.create(
826+
model=config.CLAUDE_MODEL,
827+
max_tokens=1000,
828+
system=system_prompt,
829+
messages=[
830+
{"role": "user", "content": full_prompt}
831+
]
832+
).content[0].text
820833

821834
return response
822835

omnimcp/omnimcp/run_omnimcp.py

Lines changed: 208 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,215 @@
1-
"""Entry point for OmniMCP CLI."""
1+
"""Entry point for OmniMCP CLI.
2+
3+
This module provides a command-line interface for OmniMCP, allowing you to run
4+
it in various modes (CLI, MCP server, debug visualizations).
5+
"""
6+
7+
import datetime
8+
import fire
9+
import os
10+
from loguru import logger
211

312
# Set up path to include OpenAdapt modules
413
from . import pathing
14+
from .omnimcp import OmniMCP
15+
from .config import config
16+
17+
18+
def _create_omnimcp(
    server_url,
    claude_api_key,
    use_normalized_coordinates,
    allow_no_parser,
    auto_deploy_parser,
    skip_confirmation,
):
    """Build the OmniMCP instance that every runner mode starts from."""
    return OmniMCP(
        server_url=server_url,
        claude_api_key=claude_api_key,  # Will use config.ANTHROPIC_API_KEY if None
        use_normalized_coordinates=use_normalized_coordinates,
        allow_no_parser=allow_no_parser,
        auto_deploy_parser=auto_deploy_parser,
        skip_confirmation=skip_confirmation,
    )


def _timestamp():
    """Return the current local time formatted for use in file names."""
    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")


def _save_initial_state(omnimcp, debug_dir):
    """Create ``debug_dir`` and save an ``initial_state_*.png`` visualization."""
    os.makedirs(debug_dir, exist_ok=True)

    # Take initial screenshot and save debug visualization
    logger.info(f"Saving debug visualization to {debug_dir}")
    debug_path = os.path.join(debug_dir, f"initial_state_{_timestamp()}.png")
    omnimcp.update_visual_state()
    omnimcp.save_visual_debug(debug_path)


def _log_coordinate_mode(use_normalized_coordinates):
    """Log which coordinate system the run is using."""
    logger.info(f"Coordinate mode: {'normalized (0-1)' if use_normalized_coordinates else 'absolute (pixels)'}")


class OmniMCPRunner:
    """OmniMCP runner with different modes of operation.

    Each public method is one mode (``cli``, ``server``, ``debug``). All three
    accept the same arguments and construct an ``OmniMCP`` instance from them.
    """

    def cli(
        self,
        server_url=None,
        claude_api_key=None,
        use_normalized_coordinates=False,
        debug_dir=None,
        allow_no_parser=False,
        auto_deploy_parser=True,
        skip_confirmation=False
    ):
        """Run OmniMCP in CLI mode.

        In CLI mode, you can enter natural language commands directly in the terminal.
        OmniMCP will:
        1. Take a screenshot
        2. Analyze it with OmniParser to identify UI elements
        3. Use Claude to decide what action to take based on your command
        4. Execute the action (click, type, etc.)

        This mode is convenient for testing and doesn't require Claude Desktop.

        Args:
            server_url: URL of the OmniParser server
            claude_api_key: Claude API key (if not provided, uses value from config.py)
            use_normalized_coordinates: Use normalized (0-1) coordinates instead of pixels
            debug_dir: Directory to save debug visualizations
            allow_no_parser: If True, continue even if OmniParser is not available
            auto_deploy_parser: If True, attempt to deploy OmniParser if not available (default: True)
            skip_confirmation: If True, skip user confirmation for OmniParser deployment
        """
        omnimcp = _create_omnimcp(
            server_url,
            claude_api_key,
            use_normalized_coordinates,
            allow_no_parser,
            auto_deploy_parser,
            skip_confirmation,
        )

        # A debug snapshot is only taken when a directory was requested.
        if debug_dir:
            _save_initial_state(omnimcp, debug_dir)

        logger.info("Starting OmniMCP in CLI mode")
        _log_coordinate_mode(use_normalized_coordinates)

        # Run CLI interaction loop
        omnimcp.run_interactive()

    def server(
        self,
        server_url=None,
        claude_api_key=None,
        use_normalized_coordinates=False,
        debug_dir=None,
        allow_no_parser=False,
        auto_deploy_parser=True,
        skip_confirmation=False
    ):
        """Run OmniMCP as an MCP server.

        In server mode, OmniMCP provides UI automation tools to Claude through the
        Model Context Protocol. The server exposes tools for:
        1. Getting the current screen state with UI elements
        2. Finding UI elements by description
        3. Clicking on elements or coordinates
        4. Typing text and pressing keys

        To use with Claude Desktop:
        1. Configure Claude Desktop to use this server
        2. Ask Claude to perform UI tasks

        Args:
            server_url: URL of the OmniParser server
            claude_api_key: Claude API key (if not provided, uses value from config.py)
            use_normalized_coordinates: Use normalized (0-1) coordinates instead of pixels
            debug_dir: Directory to save debug visualizations
            allow_no_parser: If True, continue even if OmniParser is not available
            auto_deploy_parser: If True, attempt to deploy OmniParser if not available (default: True)
            skip_confirmation: If True, skip user confirmation for OmniParser deployment
        """
        omnimcp = _create_omnimcp(
            server_url,
            claude_api_key,
            use_normalized_coordinates,
            allow_no_parser,
            auto_deploy_parser,
            skip_confirmation,
        )

        # A debug snapshot is only taken when a directory was requested.
        if debug_dir:
            _save_initial_state(omnimcp, debug_dir)

        # MCP stands for Model Context Protocol (not "Control").
        logger.info("Starting OmniMCP Model Context Protocol server")
        _log_coordinate_mode(use_normalized_coordinates)

        # Run MCP server
        omnimcp.run_mcp_server()

    def debug(
        self,
        server_url=None,
        claude_api_key=None,
        use_normalized_coordinates=False,
        debug_dir=None,
        allow_no_parser=False,
        auto_deploy_parser=True,
        skip_confirmation=False
    ):
        """Run OmniMCP in debug mode.

        Debug mode takes a screenshot, analyzes it with OmniParser, and saves
        a visualization showing the detected UI elements with their descriptions.

        This is useful for:
        - Understanding what UI elements OmniParser detects
        - Debugging issues with element detection
        - Fine-tuning OmniParser integration

        Args:
            server_url: URL of the OmniParser server
            claude_api_key: Claude API key (if not provided, uses value from config.py)
            use_normalized_coordinates: Use normalized (0-1) coordinates instead of pixels
            debug_dir: Directory to save debug visualizations; when omitted, a
                timestamped directory under ``~/omnimcp_debug`` is used
            allow_no_parser: If True, continue even if OmniParser is not available
            auto_deploy_parser: If True, attempt to deploy OmniParser if not available (default: True)
            skip_confirmation: If True, skip user confirmation for OmniParser deployment
        """
        omnimcp = _create_omnimcp(
            server_url,
            claude_api_key,
            use_normalized_coordinates,
            allow_no_parser,
            auto_deploy_parser,
            skip_confirmation,
        )

        # One timestamp names both the default directory and the image file.
        timestamp = _timestamp()

        # Fall back to a per-run directory in the user's home if none was given
        if not debug_dir:
            debug_dir = os.path.join(os.path.expanduser("~"), "omnimcp_debug", f"debug_{timestamp}")

        os.makedirs(debug_dir, exist_ok=True)
        logger.info(f"Saving debug visualization to {debug_dir}")

        # Generate debug filename
        debug_path = os.path.join(debug_dir, f"screen_state_{timestamp}.png")

        # Update visual state and save debug
        logger.info("Taking screenshot and analyzing with OmniParser...")
        omnimcp.update_visual_state()
        omnimcp.save_visual_debug(debug_path)
        logger.info(f"Saved debug visualization to {debug_path}")

        # Print some stats about detected elements
        elements = omnimcp.visual_state.elements
        num_elements = len(elements)
        logger.info(f"Detected {num_elements} UI elements")

        if num_elements > 0:
            # Show a few example elements, truncating long content to 50 chars
            logger.info("Example elements:")
            for index, element in enumerate(elements[:5], start=1):
                content = element.content[:50] + "..." if len(element.content) > 50 else element.content
                logger.info(f" {index}. '{content}' at ({element.x1},{element.y1},{element.x2},{element.y2})")

            if num_elements > 5:
                logger.info(f" ... and {num_elements - 5} more elements")
207+
208+
209+
def main():
    """Expose ``OmniMCPRunner`` as the OmniMCP command-line program via Fire."""
    runner_cls = OmniMCPRunner
    fire.Fire(runner_cls)
5212

6-
# Import from OpenAdapt module
7-
from openadapt.run_omnimcp import main
8213

9214
if __name__ == "__main__":
10215
main()

0 commit comments

Comments
 (0)