Skip to content

Commit b9b7786

Browse files
committed
Add cmd_string, etc. to format strings and reduce prompts
1 parent 30170a8 commit b9b7786

File tree

1 file changed

+25
-101
lines changed

1 file changed

+25
-101
lines changed

operate/models/prompts.py

Lines changed: 25 additions & 101 deletions
Original file line number | Diff line number | Diff line change
@@ -199,8 +199,7 @@
199199
Objective: {objective}
200200
"""
201201

202-
203-
SYSTEM_PROMPT_OCR_MAC = """
202+
SYSTEM_PROMPT_OCR = """
204203
You are operating a computer, using the same operating system as a human.
205204
206205
From looking at the screen, the objective, and your previous actions, take the next best series of action.
@@ -231,7 +230,7 @@
231230
Example 1: Opens Spotlight Search on Mac and open Google Chrome
232231
```
233232
[
234-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
233+
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }},
235234
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
236235
{{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
237236
]
@@ -240,7 +239,7 @@
240239
Example 2: Open a new Google Docs when the browser is already open
241240
```
242241
[
243-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "t"] }},
242+
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "t"] }},
244243
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://docs.new/" }},
245244
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
246245
]
@@ -266,73 +265,6 @@
266265
Objective: {objective}
267266
"""
268267

269-
SYSTEM_PROMPT_OCR_WIN_LINUX = """
270-
You are operating a computer, using the same operating system as a human.
271-
272-
From looking at the screen, the objective, and your previous actions, take the next best series of action.
273-
274-
You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
275-
276-
1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method.
277-
```
278-
[{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}]
279-
```
280-
2. write - Write with your keyboard
281-
```
282-
[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
283-
```
284-
3. press - Use a hotkey or press key to operate the computer
285-
```
286-
[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
287-
```
288-
4. done - The objective is completed
289-
```
290-
[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
291-
```
292-
293-
Return the actions in array format `[]`. You can take just one action or multiple actions.
294-
295-
Here a helpful example:
296-
297-
Example 1: Opens Spotlight Search on Mac and see if Google Chrome is available to use
298-
```
299-
[
300-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
301-
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
302-
{{ "thought": "Finally I'll press enter to open Google Chrome assuming it is available", "operation": "press", "keys": ["enter"] }}
303-
]
304-
```
305-
306-
Example 2: Open a new Google Docs when the browser is already open
307-
```
308-
[
309-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "t"] }},
310-
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://docs.new/" }},
311-
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
312-
]
313-
```
314-
315-
Example 3: Search for someone on Linkedin when already on linkedin.com
316-
```
317-
[
318-
{{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }},
319-
{{ "thought": "Now that the field is active I can write the name of the person I'd like to search for", "operation": "write", "content": "John Doe" }},
320-
{{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }}
321-
]
322-
```
323-
324-
325-
A few important notes:
326-
327-
- Default to Google Chrome as the browser
328-
- Go to websites by opening a new tab with `press` and then `write` the URL
329-
- Reflect on previous actions and the screenshot to ensure they align and that your previous actions worked
330-
- If the first time clicking a button or link doesn't work, don't try again to click it. Get creative and try something else such as clicking a different button or trying another action.
331-
- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
332-
333-
Objective: {objective}
334-
"""
335-
336268
OPERATE_FIRST_MESSAGE_PROMPT = """
337269
Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done
338270
@@ -350,40 +282,32 @@ def get_system_prompt(model, objective):
350282
Format the vision prompt more efficiently and print the name of the prompt used
351283
"""
352284

353-
prompt_map = {
354-
("gpt-4-with-som", "Darwin"): (
355-
SYSTEM_PROMPT_LABELED_MAC,
356-
"SYSTEM_PROMPT_LABELED_MAC",
357-
),
358-
("gpt-4-with-som", "Other"): (
359-
SYSTEM_PROMPT_LABELED_WIN_LINUX,
360-
"SYSTEM_PROMPT_LABELED_WIN_LINUX",
361-
),
362-
("gpt-4-with-ocr", "Darwin"): (SYSTEM_PROMPT_OCR_MAC, "SYSTEM_PROMPT_OCR_MAC"),
363-
("gpt-4-with-ocr", "Other"): (
364-
SYSTEM_PROMPT_OCR_WIN_LINUX,
365-
"SYSTEM_PROMPT_OCR_WIN_LINUX",
366-
),
367-
("default", "Darwin"): (SYSTEM_PROMPT_MAC, "SYSTEM_PROMPT_MAC"),
368-
("default", "Other"): (SYSTEM_PROMPT_WIN_LINUX, "SYSTEM_PROMPT_WIN_LINUX"),
369-
}
370-
371-
os_type = "Darwin" if platform.system() == "Darwin" else "Other"
372-
373-
# Fetching the prompt tuple (string and name) based on the model and OS
374-
prompt_tuple = prompt_map.get((model, os_type), prompt_map[("default", os_type)])
375-
376-
# Extracting the prompt string and its name
377-
prompt_string, prompt_name = prompt_tuple
378-
379-
# Formatting the prompt
380-
prompt = prompt_string.format(objective=objective)
285+
if platform.system() == "Darwin":
286+
cmd_string = "command"
287+
os_search_str = ["command", "space"]
288+
else:
289+
cmd_string = "ctrl"
290+
os_search_str = ["win"]
291+
292+
if model == "gpt-4-with-som":
293+
if platform.system() == "Darwin":
294+
prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
295+
else:
296+
prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
297+
elif model == "gpt-4-with-ocr":
298+
prompt = SYSTEM_PROMPT_OCR.format(
299+
objective=objective, cmd_string=cmd_string, os_search_str=os_search_str
300+
)
301+
else:
302+
if platform.system() == "Darwin":
303+
prompt = SYSTEM_PROMPT_MAC.format(objective=objective)
304+
else:
305+
prompt = SYSTEM_PROMPT_WIN_LINUX.format(objective=objective)
381306

382307
# Optional verbose output
383308
if config.verbose:
384309
print("[get_system_prompt] model:", model)
385-
print("[get_system_prompt] prompt name:", prompt_name)
386-
# print("[get_system_prompt] prompt:", prompt)
310+
print("\n\n\n\n[get_system_prompt] prompt:", prompt, "\n\n\n\n")
387311

388312
return prompt
389313

0 commit comments

Comments
 (0)