updating usability of voice feature and installation

younesbram · younesbram · commit f2a81470ac1f · 2023-12-03T12:24:43.000-05:00
diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@
 - **Compatibility**: Designed for various multimodal models.
 - **Integration**: Currently integrated with **GPT-4v** as the default model.
 - **Future Plans**: Support for additional models.
+- **Accessibility**: Voice control thanks to [Whisper](https://github.com/mallorbc/whisper_mic) & [younesbram](https://github.com/younesbram)
+
 
 ### Current Challenges
 > **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.
@@ -66,6 +68,15 @@ source venv/bin/activate
 ```
 pip install -r requirements.txt
 ```
+5.1 **Optional installs for voice control**:
+```
+pip install -r requirements-audio.txt
+
+# For macOS users:
+brew install portaudio
+# For Linux users:
+sudo apt install portaudio19-dev python3-pyaudio
+```
 6. **Install Project and Command-Line Interface**:
 ```
 pip install .
diff --git a/operate/main.py b/operate/main.py
@@ -23,6 +23,7 @@
 import matplotlib.font_manager as fm
 from openai import OpenAI
 import sys
+from whisper_mic import WhisperMic
 
 
 load_dotenv()
@@ -192,10 +193,12 @@ def supports_ansi():
     ANSI_BRIGHT_MAGENTA = ""
 
 
-def main(model, accurate_mode):
+def main(model, accurate_mode, voice_mode=False):
     """
     Main function for the Self-Operating Computer
     """
+    # Initialize WhisperMic if voice_mode is True
+    mic = WhisperMic() if voice_mode else None
 
     message_dialog(
         title="Self-Operating Computer",
@@ -204,18 +207,23 @@ def main(model, accurate_mode):
     ).run()
 
     print("SYSTEM", platform.system())
-
+    # Clear the console
     if platform.system() == "Windows":
         os.system("cls")
     else:
         print("\033c", end="")
 
-    print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
-    print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
-
-    objective = prompt(
-        style=style,
-    )
+    if voice_mode:
+        print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)")
+        try:
+            objective = mic.listen()
+        except Exception as e:
+            print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
+            return  # Exit if voice input fails
+    else:
+        print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
+        print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
+        objective = prompt(style=style)
 
     assistant_message = {"role": "assistant", "content": USER_QUESTION}
     user_message = {
@@ -775,19 +783,33 @@ def main_entry():
         default="gpt-4-vision-preview",
     )
 
+    # Add a voice flag
+    parser.add_argument(
+        "--voice",
+        help="Use voice input mode",
+        action="store_true",
+    )
+
     parser.add_argument(
         "-accurate",
         help="Activate Reflective Mouse Click Mode",
         action="store_true",
         required=False,
     )
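+    # NOTE(review): duplicate of the "--voice" argument registered above; argparse rejects conflicting option strings - remove one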
+    parser.add_argument(
+        "--voice",
+        help="Use voice input mode",
+        action="store_true",
+    )
 
     try:
         args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate)
+        main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
     except KeyboardInterrupt:
         print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
 
 
+
 if __name__ == "__main__":
     main_entry()
diff --git a/requirements-audio.txt b/requirements-audio.txt
@@ -0,0 +1 @@
+whisper-mic
diff --git a/requirements.txt b/requirements.txt
@@ -5,6 +5,7 @@ charset-normalizer==3.3.2
 colorama==0.4.6
 contourpy==1.2.0
 cycler==0.12.1
+whisper-mic
 distro==1.8.0
 EasyProcess==1.1
 entrypoint2==1.1