Skip to content

Commit fdfc6b9

Browse files
authored
Merge pull request #52 from younesbram/voice
Voice functionality added with Whisper
2 parents 1531d07 + ef5921f commit fdfc6b9

File tree

3 files changed

+113
-46
lines changed

3 files changed

+113
-46
lines changed

README.md

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,20 @@
1111
<img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/self-operating-computer.png" width="750" style="margin: 10px;"/>
1212
</div>
1313

14-
### Key Features
14+
## Key Features
1515
- **Compatibility**: Designed for various multimodal models.
1616
- **Integration**: Currently integrated with **GPT-4v** as the default model.
1717
- **Future Plans**: Support for additional models.
18+
- **Accessibility**: Voice control thanks to [Whisper](https://github.com/mallorbc/whisper_mic) & [younesbram](https://github.com/younesbram)
1819

19-
### Current Challenges
20+
21+
## Current Challenges
2022
> **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.
2123
22-
### Ongoing Development
24+
## Ongoing Development
2325
At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.
2426

25-
### Agent-1-Vision Model API Access
27+
## Agent-1-Vision Model API Access
2628
We will soon be offering API access to our Agent-1-Vision model.
2729

2830
If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).
@@ -89,26 +91,49 @@ operate
8991
<img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-2.png" width="300" style="margin: 10px;"/>
9092
</div>
9193

92-
### Contributions are Welcomed!:
94+
## Using `operate` Modes
95+
96+
### Voice Mode
97+
- Install the additional `requirements-audio.txt`
98+
```
99+
pip install -r requirements-audio.txt
100+
```
101+
**Install device requirements**
102+
- For mac users:
103+
```
104+
brew install portaudio
105+
```
106+
- For Linux users:
107+
```
108+
sudo apt install portaudio19-dev python3-pyaudio
109+
```
110+
Run with voice mode
111+
```
112+
operate --voice
113+
114+
```
115+
116+
117+
## Contributions are Welcome!
93118

94119
If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).
95120

96-
### Feedback
121+
## Feedback
97122

98123
For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter.
99124

100-
### Join Our Discord Community
125+
## Join Our Discord Community
101126

102127
For real-time discussions and community support, join our Discord server.
103128
- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).
104129
- If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).
105130

106-
### Follow HyperWriteAI for More Updates
131+
## Follow HyperWriteAI for More Updates
107132

108133
Stay updated with the latest developments:
109134
- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).
110135
- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).
111136

112-
### Compatibility
137+
## Compatibility
113138
- This project is compatible with Mac OS, Windows, and Linux (with X server installed).
114139

operate/main.py

Lines changed: 78 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import platform
1414
import Xlib.display
1515
import Xlib.X
16-
import Xlib.Xutil # not sure if Xutil is necessary
16+
import Xlib.Xutil # not sure if Xutil is necessary
1717

1818
from prompt_toolkit import prompt
1919
from prompt_toolkit.shortcuts import message_dialog
@@ -23,6 +23,7 @@
2323
import matplotlib.font_manager as fm
2424
from openai import OpenAI
2525
import sys
26+
from whisper_mic import WhisperMic
2627

2728

2829
load_dotenv()
@@ -96,7 +97,9 @@
9697
Objective: {objective}
9798
"""
9899

99-
ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
100+
ACCURATE_PIXEL_COUNT = (
101+
200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
102+
)
100103
ACCURATE_MODE_VISION_PROMPT = """
101104
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
102105
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -192,10 +195,12 @@ def supports_ansi():
192195
ANSI_BRIGHT_MAGENTA = ""
193196

194197

195-
def main(model, accurate_mode):
198+
def main(model, accurate_mode, voice_mode=False):
196199
"""
197200
Main function for the Self-Operating Computer
198201
"""
202+
# Initialize WhisperMic if voice_mode is True
203+
mic = WhisperMic() if voice_mode else None
199204

200205
message_dialog(
201206
title="Self-Operating Computer",
@@ -204,18 +209,25 @@ def main(model, accurate_mode):
204209
).run()
205210

206211
print("SYSTEM", platform.system())
207-
212+
# Clear the console
208213
if platform.system() == "Windows":
209214
os.system("cls")
210215
else:
211216
print("\033c", end="")
212217

213-
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
214-
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
215-
216-
objective = prompt(
217-
style=style,
218-
)
218+
if voice_mode:
219+
print(
220+
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
221+
)
222+
try:
223+
objective = mic.listen()
224+
except Exception as e:
225+
print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
226+
return # Exit if voice input fails
227+
else:
228+
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
229+
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
230+
objective = prompt(style=style)
219231

220232
assistant_message = {"role": "assistant", "content": USER_QUESTION}
221233
user_message = {
@@ -316,9 +328,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
316328
"""
317329
Format the accurate mode vision prompt
318330
"""
319-
width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
320-
height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
321-
prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
331+
width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
332+
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
333+
prompt = ACCURATE_MODE_VISION_PROMPT.format(
334+
prev_x=prev_x, prev_y=prev_y, width=width, height=height
335+
)
322336
return prompt
323337

324338

@@ -345,15 +359,16 @@ def get_last_assistant_message(messages):
345359
return messages[index]
346360
return None # Return None if no assistant message is found
347361

362+
348363
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
349364
"""
350-
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
365+
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
351366
"""
352367
try:
353-
screenshot_filename = os.path.join(
354-
"screenshots", "screenshot_mini.png"
368+
screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
369+
capture_mini_screenshot_with_cursor(
370+
file_path=screenshot_filename, x=prev_x, y=prev_y
355371
)
356-
capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)
357372

358373
new_screenshot_filename = os.path.join(
359374
"screenshots", "screenshot_mini_with_grid.png"
@@ -467,7 +482,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
467482
prev_y = click_data_json["y"]
468483

469484
if DEBUG:
470-
print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
485+
print(
486+
f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
487+
)
471488
content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
472489
assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
473490

@@ -541,6 +558,7 @@ def summarize(messages, objective):
541558
print(f"Error parsing JSON: {e}")
542559
return "Failed to summarize the workflow"
543560

561+
544562
def mouse_click(click_detail):
545563
try:
546564
x = convert_percent_to_decimal(click_detail["x"])
@@ -670,36 +688,48 @@ def search(text):
670688
return "Open program: " + text
671689

672690

673-
def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
691+
def capture_mini_screenshot_with_cursor(
692+
file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
693+
):
674694
user_platform = platform.system()
675695

676696
if user_platform == "Linux":
677-
x = float(x[:-1]) # convert x from "50%" to 50.
697+
x = float(x[:-1]) # convert x from "50%" to 50.
678698
y = float(y[:-1])
679699

680-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
681-
y = (y/100) * monitor_size['height']
700+
x = (x / 100) * monitor_size[
701+
"width"
702+
] # convert x from 50 to 0.5 * monitor_width
703+
y = (y / 100) * monitor_size["height"]
682704

683705
# Define the coordinates for the rectangle
684-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
685-
x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)
706+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
707+
x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
686708

687709
screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
688-
screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
689-
screenshot.save(file_path)
710+
screenshot = screenshot.resize(
711+
(screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
712+
) # upscale the image so it's easier to see and percentage marks more visible
713+
screenshot.save(file_path)
690714

691715
screenshots_dir = "screenshots"
692-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
716+
grid_screenshot_filename = os.path.join(
717+
screenshots_dir, "screenshot_mini_with_grid.png"
718+
)
693719

694-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
720+
add_grid_to_image(
721+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
722+
)
695723
elif user_platform == "Darwin":
696-
x = float(x[:-1]) # convert x from "50%" to 50.
724+
x = float(x[:-1]) # convert x from "50%" to 50.
697725
y = float(y[:-1])
698726

699-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
700-
y = (y/100) * monitor_size['height']
727+
x = (x / 100) * monitor_size[
728+
"width"
729+
] # convert x from 50 to 0.5 * monitor_width
730+
y = (y / 100) * monitor_size["height"]
701731

702-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
732+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
703733

704734
width = ACCURATE_PIXEL_COUNT
705735
height = ACCURATE_PIXEL_COUNT
@@ -708,13 +738,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
708738
subprocess.run(["screencapture", "-C", rect, file_path])
709739

710740
screenshots_dir = "screenshots"
711-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
741+
grid_screenshot_filename = os.path.join(
742+
screenshots_dir, "screenshot_mini_with_grid.png"
743+
)
712744

713-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
745+
add_grid_to_image(
746+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
747+
)
714748

715749

716750
def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
717-
file_path=os.path.join("screenshots", "screenshot.png")
751+
file_path = os.path.join("screenshots", "screenshot.png")
718752
user_platform = platform.system()
719753

720754
if user_platform == "Windows":
@@ -727,7 +761,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
727761
monitor_size["width"] = size[0]
728762
monitor_size["height"] = size[1]
729763
screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
730-
screenshot.save(file_path)
764+
screenshot.save(file_path)
731765
elif user_platform == "Darwin": # (Mac OS)
732766
# Use the screencapture utility to capture the screen with the cursor
733767
subprocess.run(["screencapture", "-C", file_path])
@@ -775,6 +809,13 @@ def main_entry():
775809
default="gpt-4-vision-preview",
776810
)
777811

812+
# Add a voice flag
813+
parser.add_argument(
814+
"--voice",
815+
help="Use voice input mode",
816+
action="store_true",
817+
)
818+
778819
parser.add_argument(
779820
"-accurate",
780821
help="Activate Reflective Mouse Click Mode",
@@ -784,7 +825,7 @@ def main_entry():
784825

785826
try:
786827
args = parser.parse_args()
787-
main(args.model, accurate_mode=args.accurate)
828+
main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
788829
except KeyboardInterrupt:
789830
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
790831

requirements-audio.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
whisper-mic

0 commit comments

Comments
 (0)