Skip to content

Commit fdfc6b9

Browse files
authored
Merge pull request #52 from younesbram/voice
Voice functionality added with Whisper
2 parents 1531d07 + ef5921f commit fdfc6b9

File tree

3 files changed

+113
-46
lines changed

3 files changed

+113
-46
lines changed

README.md

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,20 @@
1111
<img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/self-operating-computer.png" width="750" style="margin: 10px;"/>
1212
</div>
1313

14-
### Key Features
14+
## Key Features
1515
- **Compatibility**: Designed for various multimodal models.
1616
- **Integration**: Currently integrated with **GPT-4v** as the default model.
1717
- **Future Plans**: Support for additional models.
18+
- **Accessibility**: Voice control thanks to [Whisper](https://github.com/mallorbc/whisper_mic) & [younesbram](https://github.com/younesbram)
1819

19-
### Current Challenges
20+
21+
## Current Challenges
2022
> **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.
2123
22-
### Ongoing Development
24+
## Ongoing Development
2325
At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.
2426

25-
### Agent-1-Vision Model API Access
27+
## Agent-1-Vision Model API Access
2628
We will soon be offering API access to our Agent-1-Vision model.
2729

2830
If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).
@@ -89,26 +91,49 @@ operate
8991
<img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-2.png" width="300" style="margin: 10px;"/>
9092
</div>
9193

92-
### Contributions are Welcomed!:
94+
## Using `operate` Modes
95+
96+
### Voice Mode
97+
- Install the additional `requirements-audio.txt`
98+
```
99+
pip install -r requirements-audio.txt
100+
```
101+
**Install device requirements**
102+
- For mac users:
103+
```
104+
brew install portaudio
105+
```
106+
- For Linux users:
107+
```
108+
sudo apt install portaudio19-dev python3-pyaudio
109+
```
110+
Run with voice mode
111+
```
112+
operate --voice
113+
114+
```
115+
116+
117+
## Contributions are Welcome!
93118

94119
If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).
95120

96-
### Feedback
121+
## Feedback
97122

98123
For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter.
99124

100-
### Join Our Discord Community
125+
## Join Our Discord Community
101126

102127
For real-time discussions and community support, join our Discord server.
103128
- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).
104129
- If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).
105130

106-
### Follow HyperWriteAI for More Updates
131+
## Follow HyperWriteAI for More Updates
107132

108133
Stay updated with the latest developments:
109134
- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).
110135
- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).
111136

112-
### Compatibility
137+
## Compatibility
113138
- This project is compatible with Mac OS, Windows, and Linux (with X server installed).
114139

operate/main.py

Lines changed: 78 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import platform
1414
import Xlib.display
1515
import Xlib.X
16-
import Xlib.Xutil # not sure if Xutil is necessary
16+
import Xlib.Xutil # not sure if Xutil is necessary
1717

1818
from prompt_toolkit import prompt
1919
from prompt_toolkit.shortcuts import message_dialog
@@ -23,6 +23,7 @@
2323
import matplotlib.font_manager as fm
2424
from openai import OpenAI
2525
import sys
26+
from whisper_mic import WhisperMic
2627

2728

2829
load_dotenv()
@@ -96,7 +97,9 @@
9697
Objective: {objective}
9798
"""
9899

99-
ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
100+
ACCURATE_PIXEL_COUNT = (
101+
200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
102+
)
100103
ACCURATE_MODE_VISION_PROMPT = """
101104
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
102105
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -192,10 +195,12 @@ def supports_ansi():
192195
ANSI_BRIGHT_MAGENTA = ""
193196

194197

195-
def main(model, accurate_mode):
198+
def main(model, accurate_mode, voice_mode=False):
196199
"""
197200
Main function for the Self-Operating Computer
198201
"""
202+
# Initialize WhisperMic if voice_mode is True
203+
mic = WhisperMic() if voice_mode else None
199204

200205
message_dialog(
201206
title="Self-Operating Computer",
@@ -204,18 +209,25 @@ def main(model, accurate_mode):
204209
).run()
205210

206211
print("SYSTEM", platform.system())
207-
212+
# Clear the console
208213
if platform.system() == "Windows":
209214
os.system("cls")
210215
else:
211216
print("\033c", end="")
212217

213-
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
214-
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
215-
216-
objective = prompt(
217-
style=style,
218-
)
218+
if voice_mode:
219+
print(
220+
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
221+
)
222+
try:
223+
objective = mic.listen()
224+
except Exception as e:
225+
print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
226+
return # Exit if voice input fails
227+
else:
228+
print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
229+
print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
230+
objective = prompt(style=style)
219231

220232
assistant_message = {"role": "assistant", "content": USER_QUESTION}
221233
user_message = {
@@ -316,9 +328,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
316328
"""
317329
Format the accurate mode vision prompt
318330
"""
319-
width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
320-
height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
321-
prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
331+
width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
332+
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
333+
prompt = ACCURATE_MODE_VISION_PROMPT.format(
334+
prev_x=prev_x, prev_y=prev_y, width=width, height=height
335+
)
322336
return prompt
323337

324338

@@ -345,15 +359,16 @@ def get_last_assistant_message(messages):
345359
return messages[index]
346360
return None # Return None if no assistant message is found
347361

362+
348363
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
349364
"""
350-
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
365+
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
351366
"""
352367
try:
353-
screenshot_filename = os.path.join(
354-
"screenshots", "screenshot_mini.png"
368+
screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
369+
capture_mini_screenshot_with_cursor(
370+
file_path=screenshot_filename, x=prev_x, y=prev_y
355371
)
356-
capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)
357372

358373
new_screenshot_filename = os.path.join(
359374
"screenshots", "screenshot_mini_with_grid.png"
@@ -467,7 +482,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
467482
prev_y = click_data_json["y"]
468483

469484
if DEBUG:
470-
print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
485+
print(
486+
f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
487+
)
471488
content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
472489
assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
473490

@@ -541,6 +558,7 @@ def summarize(messages, objective):
541558
print(f"Error parsing JSON: {e}")
542559
return "Failed to summarize the workflow"
543560

561+
544562
def mouse_click(click_detail):
545563
try:
546564
x = convert_percent_to_decimal(click_detail["x"])
@@ -670,36 +688,48 @@ def search(text):
670688
return "Open program: " + text
671689

672690

673-
def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
691+
def capture_mini_screenshot_with_cursor(
692+
file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
693+
):
674694
user_platform = platform.system()
675695

676696
if user_platform == "Linux":
677-
x = float(x[:-1]) # convert x from "50%" to 50.
697+
x = float(x[:-1]) # convert x from "50%" to 50.
678698
y = float(y[:-1])
679699

680-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
681-
y = (y/100) * monitor_size['height']
700+
x = (x / 100) * monitor_size[
701+
"width"
702+
] # convert x from 50 to 0.5 * monitor_width
703+
y = (y / 100) * monitor_size["height"]
682704

683705
# Define the coordinates for the rectangle
684-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
685-
x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)
706+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
707+
x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
686708

687709
screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
688-
screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
689-
screenshot.save(file_path)
710+
screenshot = screenshot.resize(
711+
(screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
712+
) # upscale the image so it's easier to see and percentage marks more visible
713+
screenshot.save(file_path)
690714

691715
screenshots_dir = "screenshots"
692-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
716+
grid_screenshot_filename = os.path.join(
717+
screenshots_dir, "screenshot_mini_with_grid.png"
718+
)
693719

694-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
720+
add_grid_to_image(
721+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
722+
)
695723
elif user_platform == "Darwin":
696-
x = float(x[:-1]) # convert x from "50%" to 50.
724+
x = float(x[:-1]) # convert x from "50%" to 50.
697725
y = float(y[:-1])
698726

699-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
700-
y = (y/100) * monitor_size['height']
727+
x = (x / 100) * monitor_size[
728+
"width"
729+
] # convert x from 50 to 0.5 * monitor_width
730+
y = (y / 100) * monitor_size["height"]
701731

702-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
732+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
703733

704734
width = ACCURATE_PIXEL_COUNT
705735
height = ACCURATE_PIXEL_COUNT
@@ -708,13 +738,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
708738
subprocess.run(["screencapture", "-C", rect, file_path])
709739

710740
screenshots_dir = "screenshots"
711-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
741+
grid_screenshot_filename = os.path.join(
742+
screenshots_dir, "screenshot_mini_with_grid.png"
743+
)
712744

713-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
745+
add_grid_to_image(
746+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
747+
)
714748

715749

716750
def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
717-
file_path=os.path.join("screenshots", "screenshot.png")
751+
file_path = os.path.join("screenshots", "screenshot.png")
718752
user_platform = platform.system()
719753

720754
if user_platform == "Windows":
@@ -727,7 +761,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
727761
monitor_size["width"] = size[0]
728762
monitor_size["height"] = size[1]
729763
screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
730-
screenshot.save(file_path)
764+
screenshot.save(file_path)
731765
elif user_platform == "Darwin": # (Mac OS)
732766
# Use the screencapture utility to capture the screen with the cursor
733767
subprocess.run(["screencapture", "-C", file_path])
@@ -775,6 +809,13 @@ def main_entry():
775809
default="gpt-4-vision-preview",
776810
)
777811

812+
# Add a voice flag
813+
parser.add_argument(
814+
"--voice",
815+
help="Use voice input mode",
816+
action="store_true",
817+
)
818+
778819
parser.add_argument(
779820
"-accurate",
780821
help="Activate Reflective Mouse Click Mode",
@@ -784,7 +825,7 @@ def main_entry():
784825

785826
try:
786827
args = parser.parse_args()
787-
main(args.model, accurate_mode=args.accurate)
828+
main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
788829
except KeyboardInterrupt:
789830
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
790831

requirements-audio.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
whisper-mic

0 commit comments

Comments
 (0)