Skip to content

Commit f673382

Browse files
committed
Consolidate SYSTEM_PROMPT_LABELED_MAC and SYSTEM_PROMPT_LABELED_WIN_LINUX
1 parent 6519a12 commit f673382

File tree

2 files changed

+20
-65
lines changed

2 files changed

+20
-65
lines changed

operate/models/apis.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,15 @@
3535
# Load configuration
3636
config = Config()
3737

38+
3839
async def get_next_action(model, messages, objective, session_id):
3940
if config.verbose:
4041
print("[Self-Operating Computer][get_next_action]")
4142
print("[Self-Operating Computer][get_next_action] model", model)
4243
if model == "gpt-4":
4344
return call_gpt_4_vision_preview(messages), None
4445
if model == "gpt-4-with-som":
45-
operation = await call_gpt_4_vision_preview_labeled(messages, objective)
46+
operation = await call_gpt_4_vision_preview_labeled(messages, objective, model)
4647
return operation, None
4748
if model == "gpt-4-with-ocr":
4849
operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
@@ -328,12 +329,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
328329
return gpt_4_fallback(messages, objective, model)
329330

330331

331-
async def call_gpt_4_vision_preview_labeled(messages, objective):
332+
async def call_gpt_4_vision_preview_labeled(messages, objective, model):
332333
time.sleep(1)
333-
client = config.initialize_openai()
334334

335-
# Construct the path to the file within the package
336335
try:
336+
client = config.initialize_openai()
337+
338+
confirm_system_prompt(messages, objective, model)
337339
file_path = pkg_resources.resource_filename("operate.models.weights", "best.pt")
338340
yolo_model = YOLO(file_path) # Load your trained model
339341
screenshots_dir = "screenshots"
@@ -500,13 +502,13 @@ def call_ollama_llava(messages):
500502
model="llava",
501503
messages=messages,
502504
)
503-
505+
504506
# Important: Remove the image path from the message history.
505507
# Ollama will attempt to load each image reference and will
506508
# eventually timeout.
507509
messages[-1]["images"] = None
508-
509-
content = response['message']['content'].strip()
510+
511+
content = response["message"]["content"].strip()
510512

511513
if content.startswith("```json"):
512514
content = content[len("```json") :] # Remove starting ```json
@@ -530,7 +532,7 @@ def call_ollama_llava(messages):
530532
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
531533
e,
532534
)
533-
535+
534536
except Exception as e:
535537
print(
536538
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",

operate/models/prompts.py

Lines changed: 10 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@
9999
"""
100100

101101

102-
SYSTEM_PROMPT_LABELED_MAC = """
103-
You are operating a computer, using the same operating system as a human.
102+
SYSTEM_PROMPT_LABELED = """
103+
You are operating a {operating_system} computer, using the same operating system as a human.
104104
105105
From looking at the screen, the objective, and your previous actions, take the next best series of action.
106106
@@ -124,13 +124,13 @@
124124
125125
# Opens Spotlight Search on Mac
126126
[
127-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
127+
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }},
128128
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
129129
]
130130
131131
# Focuses on the address bar in a browser before typing a website
132132
[
133-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "l"] }},
133+
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }},
134134
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
135135
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
136136
]
@@ -149,55 +149,6 @@
149149
Objective: {objective}
150150
"""
151151

152-
SYSTEM_PROMPT_LABELED_WIN_LINUX = """
153-
You are operating a computer, using the same operating system as a human.
154-
155-
From looking at the screen, the objective, and your previous actions, take the next best series of action.
156-
157-
You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
158-
159-
1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x`
160-
[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
161-
162-
2. write - Write with your keyboard
163-
[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
164-
165-
3. press - Use a hotkey or press key to operate the computer
166-
[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
167-
168-
4. done - The objective is completed
169-
[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
170-
171-
Return the actions in array format `[]`. You can take just one action or multiple actions.
172-
173-
Here are some helpful combinations:
174-
175-
# Opens Menu Search on Windows and Linux
176-
[
177-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
178-
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
179-
]
180-
181-
# Focuses on the address bar in a browser before typing a website
182-
[
183-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }},
184-
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
185-
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
186-
]
187-
188-
# Send a "Hello World" message in the chat
189-
[
190-
{{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }},
191-
{{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }},
192-
]
193-
194-
A few important notes:
195-
196-
- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
197-
- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
198-
199-
Objective: {objective}
200-
"""
201152

202153
SYSTEM_PROMPT_OCR = """
203154
You are operating a {operating_system} computer, using the same operating system as a human.
@@ -296,10 +247,12 @@ def get_system_prompt(model, objective):
296247
operating_system = "Linux"
297248

298249
if model == "gpt-4-with-som":
299-
if platform.system() == "Darwin":
300-
prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
301-
else:
302-
prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
250+
prompt = SYSTEM_PROMPT_LABELED.format(
251+
objective=objective,
252+
cmd_string=cmd_string,
253+
os_search_str=os_search_str,
254+
operating_system=operating_system,
255+
)
303256
elif model == "gpt-4-with-ocr":
304257
prompt = SYSTEM_PROMPT_OCR.format(
305258
objective=objective,

0 commit comments

Comments
 (0)