1313import platform
1414import Xlib .display
1515import Xlib .X
16- import Xlib .Xutil # not sure if Xutil is necessary
16+ import Xlib .Xutil # not sure if Xutil is necessary
1717
1818from prompt_toolkit import prompt
1919from prompt_toolkit .shortcuts import message_dialog
9797Objective: {objective}
9898"""
9999
100- ACCURATE_PIXEL_COUNT = 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
100+ ACCURATE_PIXEL_COUNT = (
101+ 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big
102+ )
101103ACCURATE_MODE_VISION_PROMPT = """
102104It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
103105As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
@@ -193,12 +195,12 @@ def supports_ansi():
193195 ANSI_BRIGHT_MAGENTA = ""
194196
195197
196- def main (model , accurate_mode , voice_mode = False , voice_mode = False ):
198+ def main (model , accurate_mode , voice_mode = False ):
197199 """
198200 Main function for the Self-Operating Computer
199201 """
200202 # Initialize WhisperMic if voice_mode is True if voice_mode is True
201- mic = WhisperMic () if voice_mode else None if voice_mode else None
203+ mic = WhisperMic () if voice_mode else None if voice_mode else None
202204
203205 message_dialog (
204206 title = "Self-Operating Computer" ,
@@ -214,7 +216,9 @@ def main(model, accurate_mode, voice_mode=False, voice_mode=False):
214216 print ("\033 c" , end = "" )
215217
216218 if voice_mode :
217- print (f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_RESET } Listening for your command... (speak now)" )
219+ print (
220+ f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_RESET } Listening for your command... (speak now)"
221+ )
218222 try :
219223 objective = mic .listen ()
220224 except Exception as e :
@@ -324,9 +328,11 @@ def format_accurate_mode_vision_prompt(prev_x, prev_y):
324328 """
325329 Format the accurate mode vision prompt
326330 """
327- width = ((ACCURATE_PIXEL_COUNT / 2 )/ monitor_size ['width' ]) * 100
328- height = ((ACCURATE_PIXEL_COUNT / 2 )/ monitor_size ['height' ]) * 100
329- prompt = ACCURATE_MODE_VISION_PROMPT .format (prev_x = prev_x , prev_y = prev_y , width = width , height = height )
331+ width = ((ACCURATE_PIXEL_COUNT / 2 ) / monitor_size ["width" ]) * 100
332+ height = ((ACCURATE_PIXEL_COUNT / 2 ) / monitor_size ["height" ]) * 100
333+ prompt = ACCURATE_MODE_VISION_PROMPT .format (
334+ prev_x = prev_x , prev_y = prev_y , width = width , height = height
335+ )
330336 return prompt
331337
332338
@@ -353,15 +359,16 @@ def get_last_assistant_message(messages):
353359 return messages [index ]
354360 return None # Return None if no assistant message is found
355361
362+
356363def accurate_mode_double_check (pseudo_messages , prev_x , prev_y ):
357364 """
358- Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
365+ Reprompt OAI with additional screenshot of a mini screenshot centered around the cursor for further finetuning of clicked location
359366 """
360367 try :
361- screenshot_filename = os .path .join (
362- "screenshots" , "screenshot_mini.png"
368+ screenshot_filename = os .path .join ("screenshots" , "screenshot_mini.png" )
369+ capture_mini_screenshot_with_cursor (
370+ file_path = screenshot_filename , x = prev_x , y = prev_y
363371 )
364- capture_mini_screenshot_with_cursor (file_path = screenshot_filename , x = prev_x , y = prev_y )
365372
366373 new_screenshot_filename = os .path .join (
367374 "screenshots" , "screenshot_mini_with_grid.png"
@@ -475,7 +482,9 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
475482 prev_y = click_data_json ["y" ]
476483
477484 if DEBUG :
478- print (f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } " )
485+ print (
486+ f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } "
487+ )
479488 content = accurate_mode_double_check (pseudo_messages , prev_x , prev_y )
480489 assert content != "ERROR" , "ERROR: accurate_mode_double_check failed"
481490
@@ -549,6 +558,7 @@ def summarize(messages, objective):
549558 print (f"Error parsing JSON: { e } " )
550559 return "Failed to summarize the workflow"
551560
561+
552562def mouse_click (click_detail ):
553563 try :
554564 x = convert_percent_to_decimal (click_detail ["x" ])
@@ -678,36 +688,48 @@ def search(text):
678688 return "Open program: " + text
679689
680690
681- def capture_mini_screenshot_with_cursor (file_path = os .path .join ("screenshots" , "screenshot_mini.png" ), x = 0 , y = 0 ):
691+ def capture_mini_screenshot_with_cursor (
692+ file_path = os .path .join ("screenshots" , "screenshot_mini.png" ), x = 0 , y = 0
693+ ):
682694 user_platform = platform .system ()
683695
684696 if user_platform == "Linux" :
685- x = float (x [:- 1 ]) # convert x from "50%" to 50.
697+ x = float (x [:- 1 ]) # convert x from "50%" to 50.
686698 y = float (y [:- 1 ])
687699
688- x = (x / 100 ) * monitor_size ['width' ] # convert x from 50 to 0.5 * monitor_width
689- y = (y / 100 ) * monitor_size ['height' ]
700+ x = (x / 100 ) * monitor_size [
701+ "width"
702+ ] # convert x from 50 to 0.5 * monitor_width
703+ y = (y / 100 ) * monitor_size ["height" ]
690704
691705 # Define the coordinates for the rectangle
692- x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
693- x2 , y2 = int (x + ACCURATE_PIXEL_COUNT / 2 ), int (y + ACCURATE_PIXEL_COUNT / 2 )
706+ x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
707+ x2 , y2 = int (x + ACCURATE_PIXEL_COUNT / 2 ), int (y + ACCURATE_PIXEL_COUNT / 2 )
694708
695709 screenshot = ImageGrab .grab (bbox = (x1 , y1 , x2 , y2 ))
696- screenshot = screenshot .resize ((screenshot .width * 2 , screenshot .height * 2 ), Image .LANCZOS ) # upscale the image so it's easier to see and percentage marks more visible
697- screenshot .save (file_path )
710+ screenshot = screenshot .resize (
711+ (screenshot .width * 2 , screenshot .height * 2 ), Image .LANCZOS
712+ ) # upscale the image so it's easier to see and percentage marks more visible
713+ screenshot .save (file_path )
698714
699715 screenshots_dir = "screenshots"
700- grid_screenshot_filename = os .path .join (screenshots_dir , "screenshot_mini_with_grid.png" )
716+ grid_screenshot_filename = os .path .join (
717+ screenshots_dir , "screenshot_mini_with_grid.png"
718+ )
701719
702- add_grid_to_image (file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 ))
720+ add_grid_to_image (
721+ file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 )
722+ )
703723 elif user_platform == "Darwin" :
704- x = float (x [:- 1 ]) # convert x from "50%" to 50.
724+ x = float (x [:- 1 ]) # convert x from "50%" to 50.
705725 y = float (y [:- 1 ])
706726
707- x = (x / 100 ) * monitor_size ['width' ] # convert x from 50 to 0.5 * monitor_width
708- y = (y / 100 ) * monitor_size ['height' ]
727+ x = (x / 100 ) * monitor_size [
728+ "width"
729+ ] # convert x from 50 to 0.5 * monitor_width
730+ y = (y / 100 ) * monitor_size ["height" ]
709731
710- x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
732+ x1 , y1 = int (x - ACCURATE_PIXEL_COUNT / 2 ), int (y - ACCURATE_PIXEL_COUNT / 2 )
711733
712734 width = ACCURATE_PIXEL_COUNT
713735 height = ACCURATE_PIXEL_COUNT
@@ -716,13 +738,17 @@ def capture_mini_screenshot_with_cursor(file_path=os.path.join("screenshots", "s
716738 subprocess .run (["screencapture" , "-C" , rect , file_path ])
717739
718740 screenshots_dir = "screenshots"
719- grid_screenshot_filename = os .path .join (screenshots_dir , "screenshot_mini_with_grid.png" )
741+ grid_screenshot_filename = os .path .join (
742+ screenshots_dir , "screenshot_mini_with_grid.png"
743+ )
720744
721- add_grid_to_image (file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 ))
745+ add_grid_to_image (
746+ file_path , grid_screenshot_filename , int (ACCURATE_PIXEL_COUNT / 2 )
747+ )
722748
723749
724750def capture_screen_with_cursor (file_path = os .path .join ("screenshots" , "screenshot.png" )):
725- file_path = os .path .join ("screenshots" , "screenshot.png" )
751+ file_path = os .path .join ("screenshots" , "screenshot.png" )
726752 user_platform = platform .system ()
727753
728754 if user_platform == "Windows" :
@@ -735,7 +761,7 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
735761 monitor_size ["width" ] = size [0 ]
736762 monitor_size ["height" ] = size [1 ]
737763 screenshot = ImageGrab .grab (bbox = (0 , 0 , size [0 ], size [1 ]))
738- screenshot .save (file_path )
764+ screenshot .save (file_path )
739765 elif user_platform == "Darwin" : # (Mac OS)
740766 # Use the screencapture utility to capture the screen with the cursor
741767 subprocess .run (["screencapture" , "-C" , file_path ])
@@ -796,20 +822,13 @@ def main_entry():
796822 action = "store_true" ,
797823 required = False ,
798824 )
799- # Add a voice flag
800- parser .add_argument (
801- "--voice" ,
802- help = "Use voice input mode" ,
803- action = "store_true" ,
804- )
805825
806826 try :
807827 args = parser .parse_args ()
808- main (args .model , accurate_mode = args .accurate , voice_mode = args .voice , voice_mode = args . voice )
828+ main (args .model , accurate_mode = args .accurate , voice_mode = args .voice )
809829 except KeyboardInterrupt :
810830 print (f"\n { ANSI_BRIGHT_MAGENTA } Exiting..." )
811831
812832
813-
814833if __name__ == "__main__" :
815834 main_entry ()
0 commit comments