1313import platform
1414import Xlib .display
1515import Xlib .X
16- import Xlib .Xutil # not sure if Xutil is necessary
16+ import Xlib .Xutil # not sure if Xutil is necessary
1717
1818from prompt_toolkit import prompt
1919from prompt_toolkit .shortcuts import message_dialog
9696Objective: {objective}
9797"""
9898
99- ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
99+ ACCURATE_PIXEL_COUNT = (
100+ 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
101+ )
100102ACCURATE_MODE_VISION_PROMPT = """
101103It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
102104As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -196,6 +198,22 @@ def main(model, accurate_mode):
196198 """
197199 Main function for the Self-Operating Computer
198200 """
201+ mic = None
202+ # Initialize WhisperMic if voice_mode is True
203+ """
204+ Main function for the Self-Operating Computer
205+ """
206+ if voice_mode :
207+ try :
208+ from whisper_mic import WhisperMic
209+
210+ # Initialize WhisperMic if import is successful
211+ mic = WhisperMic ()
212+ except ImportError :
213+ print (
214+ "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'"
215+ )
216+ sys .exit (1 )
199217
200218 message_dialog (
201219 title = "Self-Operating Computer" ,
def format_accurate_mode_vision_prompt(prev_x, prev_y):
    """
    Build the accurate-mode vision prompt for the previously attempted click.

    Half of ACCURATE_PIXEL_COUNT is expressed as a percentage of the monitor
    width and of the monitor height (both read from monitor_size). Those two
    percentages, together with the previous click coordinates, are substituted
    into ACCURATE_MODE_VISION_PROMPT and the resulting string is returned.
    """
    # Distance from the centre of the mini screenshot to its edge.
    half_span = ACCURATE_PIXEL_COUNT / 2

    width_percent = (half_span / monitor_size["width"]) * 100
    height_percent = (half_span / monitor_size["height"]) * 100

    return ACCURATE_MODE_VISION_PROMPT.format(
        prev_x=prev_x,
        prev_y=prev_y,
        width=width_percent,
        height=height_percent,
    )
323343
324344
@@ -345,15 +365,16 @@ def get_last_assistant_message(messages):
345365 return messages [index ]
346366 return None # Return None if no assistant message is found
347367
368+
348369def accurate_mode_double_check (pseudo_messages , prev_x , prev_y ):
349370 """
350- Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
371+ Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
351372 """
352373 try :
353- screenshot_filename = os .path .join (
354- "screenshots" , "screenshot_mini.png"
374+ screenshot_filename = os .path .join ("screenshots" , "screenshot_mini.png" )
375+ capture_mini_screenshot_with_cursor (
376+ file_path = screenshot_filename , x = prev_x , y = prev_y
355377 )
356- capture_mini_screenshot_with_cursor (file_path = screenshot_filename , x = prev_x , y = prev_y )
357378
358379 new_screenshot_filename = os .path .join (
359380 "screenshots" , "screenshot_mini_with_grid.png"
@@ -467,7 +488,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
467488 prev_y = click_data_json ["y" ]
468489
469490 if DEBUG :
470- print (f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } " )
491+ print (
492+ f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } "
493+ )
471494 content = accurate_mode_double_check (pseudo_messages , prev_x , prev_y )
472495 assert content != "ERROR" , "ERROR: accurate_mode_double_check failed"
473496
@@ -541,6 +564,7 @@ def summarize(messages, objective):
541564 print (f"Error parsing JSON: { e } " )
542565 return "Failed to summarize the workflow"
543566
567+
544568def mouse_click (click_detail ):
545569 try :
546570 x = convert_percent_to_decimal (click_detail ["x" ])
@@ -670,36 +694,48 @@ def search(text):
670694 return "Open program: " + text
671695
672696
673- def capture_mini_screenshot_with_cursor (file_path = os .path .join ("screenshots" , "screenshot_mini.png" ), x = 0 , y = 0 ):
697+ def capture_mini_screenshot_with_cursor (
698+ file_path = os .path .join ("screenshots" , "screenshot_mini.png" ), x = 0 , y = 0
699+ ):
674700 user_platform = platform .system ()
675701
676702 if user_platform == "Linux" :
677- x = float (x [:- 1 ]) # convert x from "50%" to 50.
703+ x = float (x [:- 1 ]) # convert x from "50%" to 50.
678704 y = float (y [:- 1 ])
679705
680- x = (x / 100 ) * monitor_size ['width' ] # convert x from 50 to 0.5 * monitor_width
681- y = (y / 100 ) * monitor_size ['height' ]
706+ x = (x / 100 ) * monitor_size [
707+ "width"
708+ ] # convert x from 50 to 0.5 * monitor_width
709+ y = (y / 100 ) * monitor_size ["height" ]
682710
683711 # Define the coordinates for the rectangle
684- x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
685- x2 , y2 = int (x + ACCURATE_PIXEL_COUNT / 2 ), int (y + ACCURATE_PIXEL_COUNT / 2 )
712+ x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
713+ x2 , y2 = int (x + ACCURATE_PIXEL_COUNT / 2 ), int (y + ACCURATE_PIXEL_COUNT / 2 )
686714
687715 screenshot = ImageGrab .grab (bbox = (x1 , y1 , x2 , y2 ))
688- screenshot = screenshot .resize ((screenshot .width * 2 , screenshot .height * 2 ), Image .LANCZOS ) # upscale the image so it's easier to see and percentage marks more visible
689- screenshot .save (file_path )
716+ screenshot = screenshot .resize (
717+ (screenshot .width * 2 , screenshot .height * 2 ), Image .LANCZOS
718+ ) # upscale the image so it's easier to see and percentage marks more visible
719+ screenshot .save (file_path )
690720
691721 screenshots_dir = "screenshots"
692- grid_screenshot_filename = os .path .join (screenshots_dir , "screenshot_mini_with_grid.png" )
722+ grid_screenshot_filename = os .path .join (
723+ screenshots_dir , "screenshot_mini_with_grid.png"
724+ )
693725
694- add_grid_to_image (file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 ))
726+ add_grid_to_image (
727+ file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 )
728+ )
695729 elif user_platform == "Darwin" :
696- x = float (x [:- 1 ]) # convert x from "50%" to 50.
730+ x = float (x [:- 1 ]) # convert x from "50%" to 50.
697731 y = float (y [:- 1 ])
698732
699- x = (x / 100 ) * monitor_size ['width' ] # convert x from 50 to 0.5 * monitor_width
700- y = (y / 100 ) * monitor_size ['height' ]
733+ x = (x / 100 ) * monitor_size [
734+ "width"
735+ ] # convert x from 50 to 0.5 * monitor_width
736+ y = (y / 100 ) * monitor_size ["height" ]
701737
702- x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
738+ x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
703739
704740 width = ACCURATE_PIXEL_COUNT
705741 height = ACCURATE_PIXEL_COUNT
@@ -708,13 +744,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
708744 subprocess .run (["screencapture" , "-C" , rect , file_path ])
709745
710746 screenshots_dir = "screenshots"
711- grid_screenshot_filename = os .path .join (screenshots_dir , "screenshot_mini_with_grid.png" )
747+ grid_screenshot_filename = os .path .join (
748+ screenshots_dir , "screenshot_mini_with_grid.png"
749+ )
712750
713- add_grid_to_image (file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 ))
751+ add_grid_to_image (
752+ file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 )
753+ )
714754
715755
716756def capture_screen_with_cursor (file_path = os .path .join ("screenshots" , "screenshot.png" )):
717- file_path = os .path .join ("screenshots" , "screenshot.png" )
757+ file_path = os .path .join ("screenshots" , "screenshot.png" )
718758 user_platform = platform .system ()
719759
720760 if user_platform == "Windows" :
@@ -727,7 +767,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
727767 monitor_size ["width" ] = size [0 ]
728768 monitor_size ["height" ] = size [1 ]
729769 screenshot = ImageGrab .grab (bbox = (0 , 0 , size [0 ], size [1 ]))
730- screenshot .save (file_path )
770+ screenshot .save (file_path )
731771 elif user_platform == "Darwin" : # (Mac OS)
732772 # Use the screencapture utility to capture the screen with the cursor
733773 subprocess .run (["screencapture" , "-C" , file_path ])
0 commit comments