Merge branch 'voice-fixes'

joshbickett · joshbickett · commit d687e14ccad5 · 2023-12-05T07:13:29.000-08:00
diff --git a/operate/main.py b/operate/main.py
@@ -13,7 +13,7 @@
 import platform
 import Xlib.display
 import Xlib.X
-import Xlib.Xutil # not sure if Xutil is necessary
+import Xlib.Xutil  # not sure if Xutil is necessary
 
 from prompt_toolkit import prompt
 from prompt_toolkit.shortcuts import message_dialog
@@ -96,7 +96,9 @@
 Objective: {objective}
 """
 
-ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
+ACCURATE_PIXEL_COUNT = (
+    200  # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
+)
 ACCURATE_MODE_VISION_PROMPT = """
 It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
 As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. 
@@ -196,6 +198,22 @@ def main(model, accurate_mode):
     """
     Main function for the Self-Operating Computer
     """
+    mic = None
+    # Initialize WhisperMic if voice_mode is True
+    """
+    Main function for the Self-Operating Computer
+    """
+    if voice_mode:
+        try:
+            from whisper_mic import WhisperMic
+
+            # Initialize WhisperMic if import is successful
+            mic = WhisperMic()
+        except ImportError:
+            print(
+                "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'"
+            )
+            sys.exit(1)
 
     message_dialog(
         title="Self-Operating Computer",
@@ -316,9 +334,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
     """
     Format the accurate mode vision prompt
     """
-    width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
-    height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
-    prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
+    width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
+    height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
+    prompt = ACCURATE_MODE_VISION_PROMPT.format(
+        prev_x=prev_x, prev_y=prev_y, width=width, height=height
+    )
     return prompt
 
 
@@ -345,15 +365,16 @@ def get_last_assistant_message(messages):
                 return messages[index]
     return None  # Return None if no assistant message is found
 
+
 def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
     """
-    Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location 
+    Reprompt OAI with an additional mini screenshot centered around the cursor to fine-tune the clicked location
     """
     try:
-        screenshot_filename = os.path.join(
-            "screenshots", "screenshot_mini.png"
+        screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
+        capture_mini_screenshot_with_cursor(
+            file_path=screenshot_filename, x=prev_x, y=prev_y
         )
-        capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)
 
         new_screenshot_filename = os.path.join(
             "screenshots", "screenshot_mini_with_grid.png"
@@ -467,7 +488,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
                 prev_y = click_data_json["y"]
 
                 if DEBUG:
-                    print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
+                    print(
+                        f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
+                    )
                 content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
                 assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
 
@@ -541,6 +564,7 @@ def summarize(messages, objective):
         print(f"Error parsing JSON: {e}")
         return "Failed to summarize the workflow"
 
+
 def mouse_click(click_detail):
     try:
         x = convert_percent_to_decimal(click_detail["x"])
@@ -670,36 +694,48 @@ def search(text):
     return "Open program: " + text
 
 
-def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
+def capture_mini_screenshot_with_cursor(
+    file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
+):
     user_platform = platform.system()
 
     if user_platform == "Linux":
-        x = float(x[:-1]) # convert x from "50%" to 50.
+        x = float(x[:-1])  # convert x from "50%" to 50.
         y = float(y[:-1])
 
-        x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
-        y = (y/100) * monitor_size['height']
+        x = (x / 100) * monitor_size[
+            "width"
+        ]  # convert x from 50 to 0.5 * monitor_width
+        y = (y / 100) * monitor_size["height"]
 
         # Define the coordinates for the rectangle
-        x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
-        x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)
+        x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
+        x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
 
         screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
-        screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
-        screenshot.save(file_path)            
+        screenshot = screenshot.resize(
+            (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
+        )  # upscale the image so it's easier to see and the percentage marks are more visible
+        screenshot.save(file_path)
 
         screenshots_dir = "screenshots"
-        grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
+        grid_screenshot_filename = os.path.join(
+            screenshots_dir, "screenshot_mini_with_grid.png"
+        )
 
-        add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
+        add_grid_to_image(
+            file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
+        )
     elif user_platform == "Darwin":
-        x = float(x[:-1]) # convert x from "50%" to 50.
+        x = float(x[:-1])  # convert x from "50%" to 50.
         y = float(y[:-1])
 
-        x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
-        y = (y/100) * monitor_size['height']
+        x = (x / 100) * monitor_size[
+            "width"
+        ]  # convert x from 50 to 0.5 * monitor_width
+        y = (y / 100) * monitor_size["height"]
 
-        x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
+        x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
 
         width = ACCURATE_PIXEL_COUNT
         height = ACCURATE_PIXEL_COUNT
@@ -708,13 +744,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
         subprocess.run(["screencapture", "-C", rect, file_path])
 
         screenshots_dir = "screenshots"
-        grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
+        grid_screenshot_filename = os.path.join(
+            screenshots_dir, "screenshot_mini_with_grid.png"
+        )
 
-        add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
+        add_grid_to_image(
+            file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
+        )
 
 
 def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
-    file_path=os.path.join("screenshots", "screenshot.png")
+    file_path = os.path.join("screenshots", "screenshot.png")
     user_platform = platform.system()
 
     if user_platform == "Windows":
@@ -727,7 +767,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
         monitor_size["width"] = size[0]
         monitor_size["height"] = size[1]
         screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
-        screenshot.save(file_path)            
+        screenshot.save(file_path)
     elif user_platform == "Darwin":  # (Mac OS)
         # Use the screencapture utility to capture the screen with the cursor
         subprocess.run(["screencapture", "-C", file_path])