
Commit 0d6c5c2

Revert "Voice functionality added with Whisper"
1 parent 97e23d5 · commit 0d6c5c2

3 files changed: +46 -113 lines

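For context, the voice-input path removed by this revert can be read off the `-` lines in the operate/main.py diff below. Reassembled as a self-contained sketch (the `get_objective` helper, the plain print strings, and the return-value handling are illustrative, not code from the original file; ANSI color codes are omitted):

```python
# Hypothetical reassembly of the reverted voice path, based on the removed
# lines in operate/main.py below: WhisperMic transcribes a spoken objective,
# otherwise the user types one at a prompt_toolkit prompt.
from prompt_toolkit import prompt
from whisper_mic import WhisperMic


def get_objective(voice_mode: bool = False):
    # WhisperMic is only initialized when voice mode is requested
    mic = WhisperMic() if voice_mode else None

    if voice_mode:
        print("[Self-Operating Computer] Listening for your command... (speak now)")
        try:
            # Record from the microphone and transcribe with Whisper
            return mic.listen()
        except Exception as e:
            print(f"Error in capturing voice input: {e}")
            return None  # caller exits if voice input fails
    else:
        print("[User]")
        return prompt()
```

The revert also drops the `--voice` argparse flag that selected this path, the `whisper_mic` import, and the `requirements-audio.txt` install instructions shown in the README diff.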

README.md (9 additions, 34 deletions)
@@ -11,20 +11,18 @@
 <img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/self-operating-computer.png" width="750" style="margin: 10px;"/>
 </div>

-## Key Features
+### Key Features
 - **Compatibility**: Designed for various multimodal models.
 - **Integration**: Currently integrated with **GPT-4v** as the default model.
 - **Future Plans**: Support for additional models.
-- **Accessibility**: Voice control thanks to [Whisper](https://github.com/mallorbc/whisper_mic) & [younesbram](https://github.com/younesbram)

-
-## Current Challenges
+### Current Challenges
 > **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.

-## Ongoing Development
+### Ongoing Development
 At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.

-## Agent-1-Vision Model API Access
+### Agent-1-Vision Model API Access
 We will soon be offering API access to our Agent-1-Vision model.

 If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).
@@ -91,49 +89,26 @@ operate
 <img src="https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-2.png" width="300" style="margin: 10px;"/>
 </div>

-## Using `operate` Modes
-
-### Voice Mode
-- Install the additional `requirements-audio.txt`
-```
-pip install -r requirements-audio.txt
-```
-**Install device requirements**
-- For mac users:
-```
-brew install portaudio
-```
-- For Linux users:
-```
-sudo apt install portaudio19-dev python3-pyaudio
-```
-Run with voice mode
-```
-operate --voice
-
-```
-
-
-## Contributions are Welcomed!:
+### Contributions are Welcomed!:

 If you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).

-## Feedback
+### Feedback

 For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter.

-## Join Our Discord Community
+### Join Our Discord Community

 For real-time discussions and community support, join our Discord server.
 - If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).
 - If you're new, first [join our Discord Server](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).

-## Follow HyperWriteAI for More Updates
+### Follow HyperWriteAI for More Updates

 Stay updated with the latest developments:
 - Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).
 - Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).

-## Compatibility
+### Compatibility
 - This project is compatible with Mac OS, Windows, and Linux (with X server installed).

operate/main.py (37 additions, 78 deletions)
@@ -13,7 +13,7 @@
 import platform
 import Xlib.display
 import Xlib.X
-import Xlib.Xutil  # not sure if Xutil is necessary
+import Xlib.Xutil # not sure if Xutil is necessary

 from prompt_toolkit import prompt
 from prompt_toolkit.shortcuts import message_dialog
@@ -23,7 +23,6 @@
 import matplotlib.font_manager as fm
 from openai import OpenAI
 import sys
-from whisper_mic import WhisperMic


 load_dotenv()
@@ -97,9 +96,7 @@
 Objective: {objective}
 """

-ACCURATE_PIXEL_COUNT = (
-    200  # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
-)
+ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
 ACCURATE_MODE_VISION_PROMPT = """
 It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
 As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -195,12 +192,10 @@ def supports_ansi():
    ANSI_BRIGHT_MAGENTA = ""


-def main(model, accurate_mode, voice_mode=False):
+def main(model, accurate_mode):
    """
    Main function for the Self-Operating Computer
    """
-    # Initialize WhisperMic if voice_mode is True
-    mic = WhisperMic() if voice_mode else None

    message_dialog(
        title="Self-Operating Computer",
@@ -209,25 +204,18 @@ def main(model, accurate_mode, voice_mode=False):
    ).run()

    print("SYSTEM", platform.system())
-    # Clear the console
+
    if platform.system() == "Windows":
        os.system("cls")
    else:
        print("\033c", end="")

-    if voice_mode:
-        print(
-            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
-        )
-        try:
-            objective = mic.listen()
-        except Exception as e:
-            print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}")
-            return  # Exit if voice input fails
-    else:
-        print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
-        print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
-        objective = prompt(style=style)
+    print(f"{ANSI_GREEN}[Self-Operating Computer]\n{ANSI_RESET}{USER_QUESTION}")
+    print(f"{ANSI_YELLOW}[User]{ANSI_RESET}")
+
+    objective = prompt(
+        style=style,
+    )

    assistant_message = {"role": "assistant", "content": USER_QUESTION}
    user_message = {
@@ -328,11 +316,9 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
    """
    Format the accurate mode vision prompt
    """
-    width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
-    height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
-    prompt = ACCURATE_MODE_VISION_PROMPT.format(
-        prev_x=prev_x, prev_y=prev_y, width=width, height=height
-    )
+    width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
+    height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
+    prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
    return prompt

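Both sides of this hunk compute the same quantity, just formatted differently: half of `ACCURATE_PIXEL_COUNT` expressed as a percentage of each monitor dimension. A quick worked example, using an assumed 1920x1080 monitor (the real values are filled into `monitor_size` at runtime):

```python
# Worked example of the width/height values above. The 1920x1080 monitor is an
# assumption for illustration; ACCURATE_PIXEL_COUNT = 200 as defined earlier in this diff.
ACCURATE_PIXEL_COUNT = 200
monitor_size = {"width": 1920, "height": 1080}

width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100   # ~5.21
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100  # ~9.26
print(round(width, 2), round(height, 2))  # 5.21 9.26
```

These percentages are substituted into ACCURATE_MODE_VISION_PROMPT, presumably so the model knows how far the 200x200 mini screenshot extends from its center, in the same percentage coordinates it uses for click targets.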

@@ -359,16 +345,15 @@ def get_last_assistant_message(messages):
            return messages[index]
    return None  # Return None if no assistant message is found

-
 def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
    """
-    Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
+    Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
    """
    try:
-        screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
-        capture_mini_screenshot_with_cursor(
-            file_path=screenshot_filename, x=prev_x, y=prev_y
+        screenshot_filename = os.path.join(
+            "screenshots", "screenshot_mini.png"
        )
+        capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)

        new_screenshot_filename = os.path.join(
            "screenshots", "screenshot_mini_with_grid.png"
@@ -482,9 +467,7 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
                prev_y = click_data_json["y"]

                if DEBUG:
-                    print(
-                        f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
-                    )
+                    print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
                content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
                assert content != "ERROR", "ERROR: accurate_mode_double_check failed"

@@ -558,7 +541,6 @@ def summarize(messages, objective):
        print(f"Error parsing JSON: {e}")
        return "Failed to summarize the workflow"

-
 def mouse_click(click_detail):
    try:
        x = convert_percent_to_decimal(click_detail["x"])
@@ -688,48 +670,36 @@ def search(text):
    return "Open program: " + text


-def capture_mini_screenshot_with_cursor(
-    file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
-):
+def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
    user_platform = platform.system()

    if user_platform == "Linux":
-        x = float(x[:-1])  # convert x from "50%" to 50.
+        x = float(x[:-1]) # convert x from "50%" to 50.
        y = float(y[:-1])

-        x = (x / 100) * monitor_size[
-            "width"
-        ]  # convert x from 50 to 0.5 * monitor_width
-        y = (y / 100) * monitor_size["height"]
+        x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
+        y = (y/100) * monitor_size['height']

        # Define the coordinates for the rectangle
-        x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
-        x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
+        x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
+        x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)

        screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
-        screenshot = screenshot.resize(
-            (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
-        )  # upscale the image so it's easier to see and percentage marks more visible
-        screenshot.save(file_path)
+        screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
+        screenshot.save(file_path)

        screenshots_dir = "screenshots"
-        grid_screenshot_filename = os.path.join(
-            screenshots_dir, "screenshot_mini_with_grid.png"
-        )
+        grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")

-        add_grid_to_image(
-            file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
-        )
+        add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
    elif user_platform == "Darwin":
-        x = float(x[:-1])  # convert x from "50%" to 50.
+        x = float(x[:-1]) # convert x from "50%" to 50.
        y = float(y[:-1])

-        x = (x / 100) * monitor_size[
-            "width"
-        ]  # convert x from 50 to 0.5 * monitor_width
-        y = (y / 100) * monitor_size["height"]
+        x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
+        y = (y/100) * monitor_size['height']

-        x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
+        x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)

        width = ACCURATE_PIXEL_COUNT
        height = ACCURATE_PIXEL_COUNT
@@ -738,17 +708,13 @@ def capture_mini_screenshot_with_cursor(
        subprocess.run(["screencapture", "-C", rect, file_path])

        screenshots_dir = "screenshots"
-        grid_screenshot_filename = os.path.join(
-            screenshots_dir, "screenshot_mini_with_grid.png"
-        )
+        grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")

-        add_grid_to_image(
-            file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
-        )
+        add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))


 def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
-    file_path = os.path.join("screenshots", "screenshot.png")
+    file_path=os.path.join("screenshots", "screenshot.png")
    user_platform = platform.system()

    if user_platform == "Windows":
@@ -761,7 +727,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
        monitor_size["width"] = size[0]
        monitor_size["height"] = size[1]
        screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
-        screenshot.save(file_path)
+        screenshot.save(file_path)
    elif user_platform == "Darwin": # (Mac OS)
        # Use the screencapture utility to capture the screen with the cursor
        subprocess.run(["screencapture", "-C", file_path])
@@ -809,13 +775,6 @@ def main_entry():
        default="gpt-4-vision-preview",
    )

-    # Add a voice flag
-    parser.add_argument(
-        "--voice",
-        help="Use voice input mode",
-        action="store_true",
-    )
-
    parser.add_argument(
        "-accurate",
        help="Activate Reflective Mouse Click Mode",
@@ -825,7 +784,7 @@ def main_entry():

    try:
        args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
+        main(args.model, accurate_mode=args.accurate)
    except KeyboardInterrupt:
        print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")

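After this revert, `main_entry()` no longer defines or forwards a voice flag. Assembled from the surviving lines of the two `main_entry()` hunks above, the entry point reduces to roughly this sketch (the `--model` flag name and its help text, plus `action="store_true"` on `-accurate`, are assumptions; only the model default, the `-accurate` flag and its help text, and the `main(...)` call appear verbatim in the diff):

```python
# Sketch of the post-revert CLI entry point. Flag names and help strings other
# than "-accurate" and the model default are assumptions; ANSI colors omitted.
import argparse


def main(model, accurate_mode):
    # Placeholder standing in for the real main() shown in the diff above.
    print(f"model={model}, accurate_mode={accurate_mode}")


def main_entry():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        help="Specify the multimodal model to use",
        default="gpt-4-vision-preview",
    )
    parser.add_argument(
        "-accurate",
        help="Activate Reflective Mouse Click Mode",
        action="store_true",
    )

    try:
        args = parser.parse_args()
        main(args.model, accurate_mode=args.accurate)  # voice_mode is gone
    except KeyboardInterrupt:
        print("\nExiting...")


if __name__ == "__main__":
    main_entry()
```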

requirements-audio.txt (0 additions, 1 deletion)

This file was deleted.
