|
99 | 99 | """ |
100 | 100 |
|
101 | 101 |
|
102 | | -SYSTEM_PROMPT_LABELED_MAC = """ |
103 | | -You are operating a computer, using the same operating system as a human. |
| 102 | +SYSTEM_PROMPT_LABELED = """ |
| 103 | +You are operating a {operating_system} computer, using the same operating system as a human. |
104 | 104 |
|
105 | 105 | From looking at the screen, the objective, and your previous actions, take the next best series of actions. |
106 | 106 |
|
|
124 | 124 |
|
125 | 125 | # Opens the operating system's search (Spotlight Search on Mac, Menu Search on Windows and Linux) |
126 | 126 | [ |
127 | | - {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }}, |
| 127 | + {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }}, |
128 | 128 | {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, |
129 | 129 | ] |
130 | 130 |
|
131 | 131 | # Focuses on the address bar in a browser before typing a website |
132 | 132 | [ |
133 | | - {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "l"] }}, |
| 133 | + {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, |
134 | 134 | {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, |
135 | 135 | {{ "thought": "I'll need to press enter to go to the URL now", "operation": "press", "keys": ["enter"] }} |
136 | 136 | ] |
|
149 | 149 | Objective: {objective} |
150 | 150 | """ |
151 | 151 |
|
152 | | -SYSTEM_PROMPT_LABELED_WIN_LINUX = """ |
153 | | -You are operating a computer, using the same operating system as a human. |
154 | | -
|
155 | | -From looking at the screen, the objective, and your previous actions, take the next best series of action. |
156 | | -
|
157 | | -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. |
158 | | -
|
159 | | -1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` |
160 | | -[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format |
161 | | -
|
162 | | -2. write - Write with your keyboard |
163 | | -[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] |
164 | | -
|
165 | | -3. press - Use a hotkey or press key to operate the computer |
166 | | -[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] |
167 | | -
|
168 | | -4. done - The objective is completed |
169 | | -[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] |
170 | | -
|
171 | | -Return the actions in array format `[]`. You can take just one action or multiple actions. |
172 | | -
|
173 | | -Here are some helpful combinations: |
174 | | -
|
175 | | -# Opens Menu Search on Windows and Linux |
176 | | -[ |
177 | | - {{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }}, |
178 | | - {{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }}, |
179 | | -] |
180 | | -
|
181 | | -# Focuses on the address bar in a browser before typing a website |
182 | | -[ |
183 | | - {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }}, |
184 | | - {{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }}, |
185 | | - {{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }} |
186 | | -] |
187 | | -
|
188 | | -# Send a "Hello World" message in the chat |
189 | | -[ |
190 | | - {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, |
191 | | - {{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }}, |
192 | | -] |
193 | | -
|
194 | | -A few important notes: |
195 | | -
|
196 | | -- Go to Google Docs and Google Sheets by typing in the Chrome Address bar |
197 | | -- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user. |
198 | | -
|
199 | | -Objective: {objective} |
200 | | -""" |
201 | 152 |
|
202 | 153 | SYSTEM_PROMPT_OCR = """ |
203 | 154 | You are operating a {operating_system} computer, using the same operating system as a human. |
@@ -296,10 +247,12 @@ def get_system_prompt(model, objective): |
296 | 247 | operating_system = "Linux" |
297 | 248 |
|
298 | 249 | if model == "gpt-4-with-som": |
299 | | - if platform.system() == "Darwin": |
300 | | - prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective) |
301 | | - else: |
302 | | - prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective) |
| 250 | + prompt = SYSTEM_PROMPT_LABELED.format( |
| 251 | + objective=objective, |
| 252 | + cmd_string=cmd_string, |
| 253 | + os_search_str=os_search_str, |
| 254 | + operating_system=operating_system, |
| 255 | + ) |
303 | 256 | elif model == "gpt-4-with-ocr": |
304 | 257 | prompt = SYSTEM_PROMPT_OCR.format( |
305 | 258 | objective=objective, |
|
0 commit comments