Skip to content

Commit d687e14

Browse files
committed
Merge branch 'voice-fixes'
2 parents 796c55b + b011fa8 commit d687e14

File tree

1 file changed

+68
-28
lines changed

1 file changed

+68
-28
lines changed

operate/main.py

Lines changed: 68 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import platform
1414
import Xlib.display
1515
import Xlib.X
16-
import Xlib.Xutil # not sure if Xutil is necessary
16+
import Xlib.Xutil # not sure if Xutil is necessary
1717

1818
from prompt_toolkit import prompt
1919
from prompt_toolkit.shortcuts import message_dialog
@@ -96,7 +96,9 @@
9696
Objective: {objective}
9797
"""
9898

99-
ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
99+
ACCURATE_PIXEL_COUNT = (
100+
200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
101+
)
100102
ACCURATE_MODE_VISION_PROMPT = """
101103
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
102104
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -196,6 +198,22 @@ def main(model, accurate_mode):
196198
"""
197199
Main function for the Self-Operating Computer
198200
"""
201+
mic = None
202+
# Initialize WhisperMic if voice_mode is True
203+
"""
204+
Main function for the Self-Operating Computer
205+
"""
206+
if voice_mode:
207+
try:
208+
from whisper_mic import WhisperMic
209+
210+
# Initialize WhisperMic if import is successful
211+
mic = WhisperMic()
212+
except ImportError:
213+
print(
214+
"Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'"
215+
)
216+
sys.exit(1)
199217

200218
message_dialog(
201219
title="Self-Operating Computer",
@@ -316,9 +334,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
316334
"""
317335
Format the accurate mode vision prompt
318336
"""
319-
width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
320-
height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
321-
prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
337+
width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
338+
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
339+
prompt = ACCURATE_MODE_VISION_PROMPT.format(
340+
prev_x=prev_x, prev_y=prev_y, width=width, height=height
341+
)
322342
return prompt
323343

324344

@@ -345,15 +365,16 @@ def get_last_assistant_message(messages):
345365
return messages[index]
346366
return None # Return None if no assistant message is found
347367

368+
348369
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
349370
"""
350-
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
371+
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
351372
"""
352373
try:
353-
screenshot_filename = os.path.join(
354-
"screenshots", "screenshot_mini.png"
374+
screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
375+
capture_mini_screenshot_with_cursor(
376+
file_path=screenshot_filename, x=prev_x, y=prev_y
355377
)
356-
capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)
357378

358379
new_screenshot_filename = os.path.join(
359380
"screenshots", "screenshot_mini_with_grid.png"
@@ -467,7 +488,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
467488
prev_y = click_data_json["y"]
468489

469490
if DEBUG:
470-
print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
491+
print(
492+
f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
493+
)
471494
content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
472495
assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
473496

@@ -541,6 +564,7 @@ def summarize(messages, objective):
541564
print(f"Error parsing JSON: {e}")
542565
return "Failed to summarize the workflow"
543566

567+
544568
def mouse_click(click_detail):
545569
try:
546570
x = convert_percent_to_decimal(click_detail["x"])
@@ -670,36 +694,48 @@ def search(text):
670694
return "Open program: " + text
671695

672696

673-
def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
697+
def capture_mini_screenshot_with_cursor(
698+
file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
699+
):
674700
user_platform = platform.system()
675701

676702
if user_platform == "Linux":
677-
x = float(x[:-1]) # convert x from "50%" to 50.
703+
x = float(x[:-1]) # convert x from "50%" to 50.
678704
y = float(y[:-1])
679705

680-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
681-
y = (y/100) * monitor_size['height']
706+
x = (x / 100) * monitor_size[
707+
"width"
708+
] # convert x from 50 to 0.5 * monitor_width
709+
y = (y / 100) * monitor_size["height"]
682710

683711
# Define the coordinates for the rectangle
684-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
685-
x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)
712+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
713+
x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
686714

687715
screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
688-
screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
689-
screenshot.save(file_path)
716+
screenshot = screenshot.resize(
717+
(screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
718+
) # upscale the image so it's easier to see and percentage marks more visible
719+
screenshot.save(file_path)
690720

691721
screenshots_dir = "screenshots"
692-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
722+
grid_screenshot_filename = os.path.join(
723+
screenshots_dir, "screenshot_mini_with_grid.png"
724+
)
693725

694-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
726+
add_grid_to_image(
727+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
728+
)
695729
elif user_platform == "Darwin":
696-
x = float(x[:-1]) # convert x from "50%" to 50.
730+
x = float(x[:-1]) # convert x from "50%" to 50.
697731
y = float(y[:-1])
698732

699-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
700-
y = (y/100) * monitor_size['height']
733+
x = (x / 100) * monitor_size[
734+
"width"
735+
] # convert x from 50 to 0.5 * monitor_width
736+
y = (y / 100) * monitor_size["height"]
701737

702-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
738+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
703739

704740
width = ACCURATE_PIXEL_COUNT
705741
height = ACCURATE_PIXEL_COUNT
@@ -708,13 +744,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
708744
subprocess.run(["screencapture", "-C", rect, file_path])
709745

710746
screenshots_dir = "screenshots"
711-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
747+
grid_screenshot_filename = os.path.join(
748+
screenshots_dir, "screenshot_mini_with_grid.png"
749+
)
712750

713-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
751+
add_grid_to_image(
752+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
753+
)
714754

715755

716756
def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
717-
file_path=os.path.join("screenshots", "screenshot.png")
757+
file_path = os.path.join("screenshots", "screenshot.png")
718758
user_platform = platform.system()
719759

720760
if user_platform == "Windows":
@@ -727,7 +767,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
727767
monitor_size["width"] = size[0]
728768
monitor_size["height"] = size[1]
729769
screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
730-
screenshot.save(file_path)
770+
screenshot.save(file_path)
731771
elif user_platform == "Darwin": # (Mac OS)
732772
# Use the screencapture utility to capture the screen with the cursor
733773
subprocess.run(["screencapture", "-C", file_path])

0 commit comments

Comments
 (0)