Skip to content

Commit 9efbd9b

Browse files
committed
Remove duplicate voice_mode
1 parent 1077615 commit 9efbd9b

File tree

1 file changed

+58
-39
lines changed

1 file changed

+58
-39
lines changed

operate/main.py

Lines changed: 58 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import platform
1414
import Xlib.display
1515
import Xlib.X
16-
import Xlib.Xutil # not sure if Xutil is necessary
16+
import Xlib.Xutil # not sure if Xutil is necessary
1717

1818
from prompt_toolkit import prompt
1919
from prompt_toolkit.shortcuts import message_dialog
@@ -97,7 +97,9 @@
9797
Objective: {objective}
9898
"""
9999

100-
ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
100+
ACCURATE_PIXEL_COUNT = (
101+
200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
102+
)
101103
ACCURATE_MODE_VISION_PROMPT = """
102104
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
103105
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -193,12 +195,12 @@ def supports_ansi():
193195
ANSI_BRIGHT_MAGENTA = ""
194196

195197

196-
def main(model, accurate_mode, voice_mode=False, voice_mode=False):
198+
def main(model, accurate_mode, voice_mode=False):
197199
"""
198200
Main function for the Self-Operating Computer
199201
"""
200202
# Initialize WhisperMic if voice_mode is True
201-
mic = WhisperMic() if voice_mode else None if voice_mode else None
203+
mic = WhisperMic() if voice_mode else None if voice_mode else None
202204

203205
message_dialog(
204206
title="Self-Operating Computer",
@@ -214,7 +216,9 @@ def main(model, accurate_mode, voice_mode=False, voice_mode=False):
214216
print("\033c", end="")
215217

216218
if voice_mode:
217-
print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)")
219+
print(
220+
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
221+
)
218222
try:
219223
objective = mic.listen()
220224
except Exception as e:
@@ -324,9 +328,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
324328
"""
325329
Format the accurate mode vision prompt
326330
"""
327-
width = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['width']) * 100
328-
height = ((ACCURATE_PIXEL_COUNT/2)/monitor_size['height']) * 100
329-
prompt = ACCURATE_MODE_VISION_PROMPT.format(prev_x=prev_x, prev_y=prev_y, width=width, height=height)
331+
width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100
332+
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100
333+
prompt = ACCURATE_MODE_VISION_PROMPT.format(
334+
prev_x=prev_x, prev_y=prev_y, width=width, height=height
335+
)
330336
return prompt
331337

332338

@@ -353,15 +359,16 @@ def get_last_assistant_message(messages):
353359
return messages[index]
354360
return None # Return None if no assistant message is found
355361

362+
356363
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
357364
"""
358-
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
365+
Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
359366
"""
360367
try:
361-
screenshot_filename = os.path.join(
362-
"screenshots", "screenshot_mini.png"
368+
screenshot_filename = os.path.join("screenshots", "screenshot_mini.png")
369+
capture_mini_screenshot_with_cursor(
370+
file_path=screenshot_filename, x=prev_x, y=prev_y
363371
)
364-
capture_mini_screenshot_with_cursor(file_path=screenshot_filename, x=prev_x, y=prev_y)
365372

366373
new_screenshot_filename = os.path.join(
367374
"screenshots", "screenshot_mini_with_grid.png"
@@ -475,7 +482,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
475482
prev_y = click_data_json["y"]
476483

477484
if DEBUG:
478-
print(f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}")
485+
print(
486+
f"Previous coords before accurate tuning: prev_x {prev_x} prev_y {prev_y}"
487+
)
479488
content = accurate_mode_double_check(pseudo_messages, prev_x, prev_y)
480489
assert content != "ERROR", "ERROR: accurate_mode_double_check failed"
481490

@@ -549,6 +558,7 @@ def summarize(messages, objective):
549558
print(f"Error parsing JSON: {e}")
550559
return "Failed to summarize the workflow"
551560

561+
552562
def mouse_click(click_detail):
553563
try:
554564
x = convert_percent_to_decimal(click_detail["x"])
@@ -678,36 +688,48 @@ def search(text):
678688
return "Open program: " + text
679689

680690

681-
def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0):
691+
def capture_mini_screenshot_with_cursor(
692+
file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
693+
):
682694
user_platform = platform.system()
683695

684696
if user_platform == "Linux":
685-
x = float(x[:-1]) # convert x from "50%" to 50.
697+
x = float(x[:-1]) # convert x from "50%" to 50.
686698
y = float(y[:-1])
687699

688-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
689-
y = (y/100) * monitor_size['height']
700+
x = (x / 100) * monitor_size[
701+
"width"
702+
] # convert x from 50 to 0.5 * monitor_width
703+
y = (y / 100) * monitor_size["height"]
690704

691705
# Define the coordinates for the rectangle
692-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
693-
x2, y2 = int(x + ACCURATE_PIXEL_COUNT/2), int(y + ACCURATE_PIXEL_COUNT/2)
706+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
707+
x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)
694708

695709
screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
696-
screenshot = screenshot.resize((screenshot.width * 2, screenshot.height * 2), Image.LANCZOS) # upscale the image so it's easier to see and percentage marks more visible
697-
screenshot.save(file_path)
710+
screenshot = screenshot.resize(
711+
(screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
712+
) # upscale the image so it's easier to see and percentage marks more visible
713+
screenshot.save(file_path)
698714

699715
screenshots_dir = "screenshots"
700-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
716+
grid_screenshot_filename = os.path.join(
717+
screenshots_dir, "screenshot_mini_with_grid.png"
718+
)
701719

702-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
720+
add_grid_to_image(
721+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
722+
)
703723
elif user_platform == "Darwin":
704-
x = float(x[:-1]) # convert x from "50%" to 50.
724+
x = float(x[:-1]) # convert x from "50%" to 50.
705725
y = float(y[:-1])
706726

707-
x = (x/100) * monitor_size['width'] # convert x from 50 to 0.5 * monitor_width
708-
y = (y/100) * monitor_size['height']
727+
x = (x / 100) * monitor_size[
728+
"width"
729+
] # convert x from 50 to 0.5 * monitor_width
730+
y = (y / 100) * monitor_size["height"]
709731

710-
x1, y1 = int(x - ACCURATE_PIXEL_COUNT/2), int(y - ACCURATE_PIXEL_COUNT/2)
732+
x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)
711733

712734
width = ACCURATE_PIXEL_COUNT
713735
height = ACCURATE_PIXEL_COUNT
@@ -716,13 +738,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
716738
subprocess.run(["screencapture", "-C", rect, file_path])
717739

718740
screenshots_dir = "screenshots"
719-
grid_screenshot_filename = os.path.join(screenshots_dir, "screenshot_mini_with_grid.png")
741+
grid_screenshot_filename = os.path.join(
742+
screenshots_dir, "screenshot_mini_with_grid.png"
743+
)
720744

721-
add_grid_to_image(file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT/2))
745+
add_grid_to_image(
746+
file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
747+
)
722748

723749

724750
def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
725-
file_path=os.path.join("screenshots", "screenshot.png")
751+
file_path = os.path.join("screenshots", "screenshot.png")
726752
user_platform = platform.system()
727753

728754
if user_platform == "Windows":
@@ -735,7 +761,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
735761
monitor_size["width"] = size[0]
736762
monitor_size["height"] = size[1]
737763
screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
738-
screenshot.save(file_path)
764+
screenshot.save(file_path)
739765
elif user_platform == "Darwin": # (Mac OS)
740766
# Use the screencapture utility to capture the screen with the cursor
741767
subprocess.run(["screencapture", "-C", file_path])
@@ -796,20 +822,13 @@ def main_entry():
796822
action="store_true",
797823
required=False,
798824
)
799-
# Add a voice flag
800-
parser.add_argument(
801-
"--voice",
802-
help="Use voice input mode",
803-
action="store_true",
804-
)
805825

806826
try:
807827
args = parser.parse_args()
808-
main(args.model, accurate_mode=args.accurate, voice_mode=args.voice, voice_mode=args.voice)
828+
main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
809829
except KeyboardInterrupt:
810830
print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
811831

812832

813-
814833
if __name__ == "__main__":
815834
main_entry()

0 commit comments

Comments
 (0)