@@ -309,7 +309,7 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False):
309309 print (
310310 f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_BLUE } Objective complete { ANSI_RESET } "
311311 )
312- summary = summarize (messages , objective )
312+ summary = summarize (model , messages , objective )
313313 print (
314314 f"{ ANSI_GREEN } [Self-Operating Computer]{ ANSI_BLUE } Summary\n { ANSI_RESET } { summary } "
315315 )
@@ -391,7 +391,7 @@ def get_next_action(model, messages, objective, accurate_mode):
391391 return "coming soon"
392392 elif model == "gemini-pro-vision" :
393393 content = get_next_action_from_gemini_pro_vision (
394- messages , objective , accurate_mode
394+ messages , objective
395395 )
396396 return content
397397
@@ -549,7 +549,7 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
549549 return "Failed take action after looking at the screenshot"
550550
551551
552- def get_next_action_from_gemini_pro_vision (messages , objective , accurate_mode ):
552+ def get_next_action_from_gemini_pro_vision (messages , objective ):
553553 """
554554 Get the next action for Self-Operating Computer using Gemini Pro Vision
555555 """
@@ -572,9 +572,6 @@ def get_next_action_from_gemini_pro_vision(messages, objective, accurate_mode):
572572 # sleep for a second
573573 time .sleep (1 )
574574
575- with open (new_screenshot_filename , "rb" ) as img_file :
576- img_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
577-
578575 previous_action = get_last_assistant_message (messages )
579576
580577 vision_prompt = format_vision_prompt (objective , previous_action )
@@ -632,7 +629,7 @@ def parse_response(response):
632629 return {"type" : "UNKNOWN" , "data" : response }
633630
634631
635- def summarize (messages , objective ):
632+ def summarize (model , messages , objective ):
636633 try :
637634 screenshots_dir = "screenshots"
638635 if not os .path .exists (screenshots_dir ):
@@ -642,33 +639,40 @@ def summarize(messages, objective):
642639 # Call the function to capture the screen with the cursor
643640 capture_screen_with_cursor (screenshot_filename )
644641
645- with open (screenshot_filename , "rb" ) as img_file :
646- img_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
647-
648642 summary_prompt = format_summary_prompt (objective )
643+
644+ if model == "gpt-4-vision-preview" :
645+ with open (screenshot_filename , "rb" ) as img_file :
646+ img_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
649647
650- summary_message = {
651- "role" : "user" ,
652- "content" : [
653- {"type" : "text" , "text" : summary_prompt },
654- {
655- "type" : "image_url" ,
656- "image_url" : {"url" : f"data:image/jpeg;base64,{ img_base64 } " },
657- },
658- ],
659- }
660- # create a copy of messages and save to pseudo_messages
661- messages .append (summary_message )
648+ summary_message = {
649+ "role" : "user" ,
650+ "content" : [
651+ {"type" : "text" , "text" : summary_prompt },
652+ {
653+ "type" : "image_url" ,
654+ "image_url" : {"url" : f"data:image/jpeg;base64,{ img_base64 } " },
655+ },
656+ ],
657+ }
658 # append the summary request to messages (no copy is made; this mutates the caller's list)
659+ messages .append (summary_message )
662660
663- response = client .chat .completions .create (
664- model = "gpt-4-vision-preview" ,
665- messages = messages ,
666- max_tokens = 500 ,
667- )
661+ response = client .chat .completions .create (
662+ model = "gpt-4-vision-preview" ,
663+ messages = messages ,
664+ max_tokens = 500 ,
665+ )
668666
669- content = response .choices [0 ].message .content
667+ content = response .choices [0 ].message .content
668+ elif model == "gemini-pro-vision" :
669+ model = genai .GenerativeModel ("gemini-pro-vision" )
670+ summary_message = model .generate_content (
671+ [summary_prompt , Image .open (screenshot_filename )]
672+ )
673+ content = summary_message .text
670674 return content
671-
675+
672676 except Exception as e :
673677 print (f"Error in summarize: { e } " )
674678 return "Failed to summarize the workflow"
0 commit comments