Skip to content

Commit f2a8147

Browse files
committed
updating usability of voice feature and installation
1 parent 51d9993 commit f2a8147

File tree

4 files changed

+44
-9
lines changed

4 files changed

+44
-9
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
- **Compatibility**: Designed for various multimodal models.
1616
- **Integration**: Currently integrated with **GPT-4v** as the default model.
1717
- **Future Plans**: Support for additional models.
18+
- **Accessibility**: Voice control thanks to [Whisper](https://github.com/mallorbc/whisper_mic) & [younesbram](https://github.com/younesbram)
19+
1820

1921
### Current Challenges
2022
> **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.
@@ -66,6 +68,15 @@ source venv/bin/activate
6668
```
6769
pip install -r requirements.txt
6870
```
71+
5.1 **Optional installs for voice control**:
72+
```
73+
pip install -r requirements-audio.txt
74+
75+
# For macOS users:
76+
brew install portaudio
77+
# For Linux users:
78+
sudo apt install portaudio19-dev python3-pyaudio
79+
```
6980
6. **Install Project and Command-Line Interface**:
7081
```
7182
pip install .

operate/main.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import matplotlib.font_manager as fm
2424
from openai import OpenAI
2525
import sys
26+
from whisper_mic import WhisperMic
2627

2728

2829
load_dotenv()
@@ -192,10 +193,12 @@ def supports_ansi():
192193
ANSI_BRIGHT_MAGENTA = ""
193194

194195

195-
def main(model, accurate_mode):
196+
def main(model, accurate_mode, voice_mode=False):
196197
"""
197198
Main function for the Self-Operating Computer
198199
"""
200+
# Initialize WhisperMic if voice_mode is True
201+
mic = WhisperMic() if voice_mode else None
199202

200203
message_dialog(
201204
title="Self-Operating Computer",
@@ -204,18 +207,23 @@ def main(model, accurate_mode):
204207
).run()
205208

206209
print("SYSTEM", platform.system())
207-
210+
# Clear the console
208211
if platform.system() == "Windows":
209212
os.system("cls")
210213
else:
211214
print("\033c", end="")
212215

213-
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
214-
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
215-
216-
objective = prompt(
217-
style=style,
218-
)
216+
if voice_mode:
217+
print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)")
218+
try:
219+
objective = mic.listen()
220+
except Exception as e:
221+
print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
222+
return # Exit if voice input fails
223+
else:
224+
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
225+
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
226+
objective = prompt(style=style)
219227

220228
assistant_message = {"role": "assistant", "content": USER_QUESTION}
221229
user_message = {
@@ -775,19 +783,33 @@ def main_entry():
775783
default="gpt-4-vision-preview",
776784
)
777785

786+
# Add a voice flag
787+
parser.add_argument(
788+
"--voice",
789+
help="Use voice input mode",
790+
action="store_true",
791+
)
792+
778793
parser.add_argument(
779794
"-accurate",
780795
help="Activate Reflective Mouse Click Mode",
781796
action="store_true",
782797
required=False,
783798
)
799+
# Add a voice flag
800+
parser.add_argument(
801+
"--voice",
802+
help="Use voice input mode",
803+
action="store_true",
804+
)
784805

785806
try:
786807
args = parser.parse_args()
787-
main(args.model, accurate_mode=args.accurate)
808+
main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
788809
except KeyboardInterrupt:
789810
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
790811

791812

813+
792814
if __name__ == "__main__":
793815
main_entry()

requirements-audio.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
whisper-mic

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ charset-normalizer==3.3.2
55
colorama==0.4.6
66
contourpy==1.2.0
77
cycler==0.12.1
8+
whisper-mic
89
distro==1.8.0
910
EasyProcess==1.1
1011
entrypoint2==1.1

0 commit comments

Comments
 (0)