Skip to content

Commit 942171e

Browse files
committed
Added summary support for Gemini Pro Vision!
1 parent d89ae30 commit 942171e

File tree

1 file changed

+33
-29
lines changed

1 file changed

+33
-29
lines changed

operate/main.py

Lines changed: 33 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -309,7 +309,7 @@ def main(model, accurate_mode, terminal_prompt, voice_mode=False):
309309
print(
310310
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}"
311311
)
312-
summary = summarize(messages, objective)
312+
summary = summarize(model, messages, objective)
313313
print(
314314
f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}"
315315
)
@@ -391,7 +391,7 @@ def get_next_action(model, messages, objective, accurate_mode):
391391
return "coming soon"
392392
elif model == "gemini-pro-vision":
393393
content = get_next_action_from_gemini_pro_vision(
394-
messages, objective, accurate_mode
394+
messages, objective
395395
)
396396
return content
397397

@@ -549,7 +549,7 @@ def get_next_action_from_openai(messages, objective, accurate_mode):
549549
return "Failed take action after looking at the screenshot"
550550

551551

552-
def get_next_action_from_gemini_pro_vision(messages, objective, accurate_mode):
552+
def get_next_action_from_gemini_pro_vision(messages, objective):
553553
"""
554554
Get the next action for Self-Operating Computer using Gemini Pro Vision
555555
"""
@@ -572,9 +572,6 @@ def get_next_action_from_gemini_pro_vision(messages, objective, accurate_mode):
572572
# sleep for a second
573573
time.sleep(1)
574574

575-
with open(new_screenshot_filename, "rb") as img_file:
576-
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
577-
578575
previous_action = get_last_assistant_message(messages)
579576

580577
vision_prompt = format_vision_prompt(objective, previous_action)
@@ -632,7 +629,7 @@ def parse_response(response):
632629
return {"type": "UNKNOWN", "data": response}
633630

634631

635-
def summarize(messages, objective):
632+
def summarize(model, messages, objective):
636633
try:
637634
screenshots_dir = "screenshots"
638635
if not os.path.exists(screenshots_dir):
@@ -642,33 +639,40 @@ def summarize(messages, objective):
642639
# Call the function to capture the screen with the cursor
643640
capture_screen_with_cursor(screenshot_filename)
644641

645-
with open(screenshot_filename, "rb") as img_file:
646-
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
647-
648642
summary_prompt = format_summary_prompt(objective)
643+
644+
if model == "gpt-4-vision-preview":
645+
with open(screenshot_filename, "rb") as img_file:
646+
img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
649647

650-
summary_message = {
651-
"role": "user",
652-
"content": [
653-
{"type": "text", "text": summary_prompt},
654-
{
655-
"type": "image_url",
656-
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
657-
},
658-
],
659-
}
660-
# create a copy of messages and save to pseudo_messages
661-
messages.append(summary_message)
648+
summary_message = {
649+
"role": "user",
650+
"content": [
651+
{"type": "text", "text": summary_prompt},
652+
{
653+
"type": "image_url",
654+
"image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
655+
},
656+
],
657+
}
658+
# create a copy of messages and save to pseudo_messages
659+
messages.append(summary_message)
662660

663-
response = client.chat.completions.create(
664-
model="gpt-4-vision-preview",
665-
messages=messages,
666-
max_tokens=500,
667-
)
661+
response = client.chat.completions.create(
662+
model="gpt-4-vision-preview",
663+
messages=messages,
664+
max_tokens=500,
665+
)
668666

669-
content = response.choices[0].message.content
667+
content = response.choices[0].message.content
668+
elif model == "gemini-pro-vision":
669+
model = genai.GenerativeModel("gemini-pro-vision")
670+
summary_message = model.generate_content(
671+
[summary_prompt, Image.open(screenshot_filename)]
672+
)
673+
content = summary_message.text
670674
return content
671-
675+
672676
except Exception as e:
673677
print(f"Error in summarize: {e}")
674678
return "Failed to summarize the workflow"

0 commit comments

Comments
 (0)