Skip to content

Commit f673382

Browse files
committed
Consolidate SYSTEM_PROMPT_LABELED_MAC and SYSTEM_PROMPT_LABELED_WIN_LINUX
1 parent 6519a12 commit f673382

File tree

2 files changed

+20
-65
lines changed

2 files changed

+20
-65
lines changed

operate/models/apis.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,15 @@
3535
# Load configuration
3636
config = Config()
3737

38+
3839
async def get_next_action(model, messages, objective, session_id):
3940
if config.verbose:
4041
print("[Self-Operating Computer][get_next_action]")
4142
print("[Self-Operating Computer][get_next_action] model", model)
4243
if model == "gpt-4":
4344
return call_gpt_4_vision_preview(messages), None
4445
if model == "gpt-4-with-som":
45-
operation = await call_gpt_4_vision_preview_labeled(messages, objective)
46+
operation = await call_gpt_4_vision_preview_labeled(messages, objective, model)
4647
return operation, None
4748
if model == "gpt-4-with-ocr":
4849
operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
@@ -328,12 +329,13 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
328329
return gpt_4_fallback(messages, objective, model)
329330

330331

331-
async def call_gpt_4_vision_preview_labeled(messages, objective):
332+
async def call_gpt_4_vision_preview_labeled(messages, objective, model):
332333
time.sleep(1)
333-
client = config.initialize_openai()
334334

335-
# Construct the path to the file within the package
336335
try:
336+
client = config.initialize_openai()
337+
338+
confirm_system_prompt(messages, objective, model)
337339
file_path = pkg_resources.resource_filename("operate.models.weights", "best.pt")
338340
yolo_model = YOLO(file_path) # Load your trained model
339341
screenshots_dir = "screenshots"
@@ -500,13 +502,13 @@ def call_ollama_llava(messages):
500502
model="llava",
501503
messages=messages,
502504
)
503-
505+
504506
# Important: Remove the image path from the message history.
505507
# Ollama will attempt to load each image reference and will
506508
# eventually timeout.
507509
messages[-1]["images"] = None
508-
509-
content = response['message']['content'].strip()
510+
511+
content = response["message"]["content"].strip()
510512

511513
if content.startswith("```json"):
512514
content = content[len("```json") :] # Remove starting ```json
@@ -530,7 +532,7 @@ def call_ollama_llava(messages):
530532
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Operate] Couldn't connect to Ollama. With Ollama installed, run `ollama pull llava` then `ollama serve`{ANSI_RESET}",
531533
e,
532534
)
533-
535+
534536
except Exception as e:
535537
print(
536538
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}",

operate/models/prompts.py

Lines changed: 10 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@
9999
"""
100100

101101

102-
SYSTEM_PROMPT_LABELED_MAC = """
103-
You are operating a computer, using the same operating system as a human.
102+
SYSTEM_PROMPT_LABELED = """
103+
You are operating a {operating_system} computer, using the same operating system as a human.
104104
105105
From looking at the screen, the objective, and your previous actions, take the next best series of action.
106106
@@ -124,13 +124,13 @@
124124
125125
# Opens Spotlight Search on Mac
126126
[
127-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["command", "space"] }},
127+
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": {os_search_str} }},
128128
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
129129
]
130130
131131
# Focuses on the address bar in a browser before typing a website
132132
[
133-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["command", "l"] }},
133+
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }},
134134
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
135135
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
136136
]
@@ -149,55 +149,6 @@
149149
Objective: {objective}
150150
"""
151151

152-
SYSTEM_PROMPT_LABELED_WIN_LINUX = """
153-
You are operating a computer, using the same operating system as a human.
154-
155-
From looking at the screen, the objective, and your previous actions, take the next best series of action.
156-
157-
You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement.
158-
159-
1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x`
160-
[{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format
161-
162-
2. write - Write with your keyboard
163-
[{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}]
164-
165-
3. press - Use a hotkey or press key to operate the computer
166-
[{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}]
167-
168-
4. done - The objective is completed
169-
[{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}]
170-
171-
Return the actions in array format `[]`. You can take just one action or multiple actions.
172-
173-
Here are some helpful combinations:
174-
175-
# Opens Menu Search on Windows and Linux
176-
[
177-
{{ "thought": "Searching the operating system to find Google Chrome because it appears I am currently in terminal", "operation": "press", "keys": ["win"] }},
178-
{{ "thought": "Now I need to write 'Google Chrome' as a next step", "operation": "write", "content": "Google Chrome" }},
179-
]
180-
181-
# Focuses on the address bar in a browser before typing a website
182-
[
183-
{{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": ["ctrl", "l"] }},
184-
{{ "thought": "Now that the address bar is in focus I can type the URL", "operation": "write", "content": "https://news.ycombinator.com/" }},
185-
{{ "thought": "I'll need to press enter to go the URL now", "operation": "press", "keys": ["enter"] }}
186-
]
187-
188-
# Send a "Hello World" message in the chat
189-
[
190-
{{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }},
191-
{{ "thought": "Now that I am focused on the message field, I'll go ahead and write ", "operation": "write", "content": "Hello World" }},
192-
]
193-
194-
A few important notes:
195-
196-
- Go to Google Docs and Google Sheets by typing in the Chrome Address bar
197-
- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.
198-
199-
Objective: {objective}
200-
"""
201152

202153
SYSTEM_PROMPT_OCR = """
203154
You are operating a {operating_system} computer, using the same operating system as a human.
@@ -296,10 +247,12 @@ def get_system_prompt(model, objective):
296247
operating_system = "Linux"
297248

298249
if model == "gpt-4-with-som":
299-
if platform.system() == "Darwin":
300-
prompt = SYSTEM_PROMPT_LABELED_MAC.format(objective=objective)
301-
else:
302-
prompt = SYSTEM_PROMPT_LABELED_WIN_LINUX.format(objective=objective)
250+
prompt = SYSTEM_PROMPT_LABELED.format(
251+
objective=objective,
252+
cmd_string=cmd_string,
253+
os_search_str=os_search_str,
254+
operating_system=operating_system,
255+
)
303256
elif model == "gpt-4-with-ocr":
304257
prompt = SYSTEM_PROMPT_OCR.format(
305258
objective=objective,

0 commit comments

Comments
 (0)