
Commit c9379e1

Use gpt-4v to evaluate summary screenshot
1 parent ff7f021 commit c9379e1

File tree

1 file changed (+72, -9 lines)


evaluate.py

Lines changed: 72 additions & 9 deletions
@@ -2,10 +2,21 @@
 import os
 import subprocess
 import platform
+import base64
 import openai

 from dotenv import load_dotenv

+SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')
+
+EVALUATION_PROMPT = """
+Your job is to look at the given screenshot and determine if the following guideline is met in the image.
+You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
+Respond TRUE or FALSE based on whether or not the given guideline is met.
+
+Guideline: {guideline}
+"""
+
 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
     """
@@ -41,14 +52,65 @@ def supports_ansi():
     ANSI_YELLOW = ""
     ANSI_RED = ""
     ANSI_BRIGHT_MAGENTA = ""
+
+
+def format_evaluation_prompt(guideline):
+    prompt = EVALUATION_PROMPT.format(guideline=guideline)
+    return prompt
+
+
+def parse_eval_content(content):
+    if content == "TRUE":
+        return True
+    elif content == "FALSE":
+        return False
+    else:
+        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        exit(1)


-def run_test_case(prompt, guideline):
+def evaluate_summary_screenshot(guideline):
+    '''Load the summary screenshot and return True or False if it meets the given guideline.'''
+    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
+        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+    eval_message = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": format_evaluation_prompt(guideline)},
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+            },
+        ],
+    }]
+
+    response = openai.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=eval_message,
+        presence_penalty=1,
+        frequency_penalty=1,
+        temperature=0.7,
+        max_tokens=300,
+    )
+
+    eval_content = response.choices[0].message.content
+
+    return parse_eval_content(eval_content)
+
+
+def run_test_case(objective, guideline):
     '''Returns True if the result of the test with the given prompt meets the given guideline.'''
-    # Run main.py with the test case prompt
-    subprocess.run(['operate', '--prompt', f'"{prompt}"'], stdout=subprocess.DEVNULL)
+    # Run `operate` with the test case prompt
+    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
+
+    try:
+        result = evaluate_summary_screenshot(guideline)
+    except(OSError):
+        print("Couldn't open the summary screenshot")
+        return False

-    return True
+    return result


 def main():
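Two details in this hunk are worth flagging. First, parse_eval_content compares the model's reply with ==, so any deviation such as 'TRUE.' or a trailing newline aborts the whole run via exit(1). Second, the screenshot is a PNG (summary_screenshot.png) but is sent under an image/jpeg data-URL prefix; the vision endpoint appears tolerant of this, but image/png would be the accurate MIME type. A more forgiving parser is sketched below; it is not part of the commit, and the name parse_eval_content_lenient is illustrative only:

    # Hedged variant (not in this commit): normalize the reply before
    # comparing, and raise instead of exiting the whole process.
    def parse_eval_content_lenient(content):
        verdict = content.strip().strip(".").upper()
        if verdict == "TRUE":
            return True
        if verdict == "FALSE":
            return False
        raise ValueError(f"Unparseable evaluation response: {content!r}")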
@@ -57,20 +119,21 @@ def main():

     # Define the test cases and the guidelines
     test_cases = {
+        "Go to Google.com": "The Google home page is visible with the search bar.",
         "Open YouTube and play holiday music": "The YouTube video player is loaded and actively playing holiday music.",
         "Open Google Docs and write a poem": "A Google Doc file is opened in the browser with a poem typed into it.",
     }

     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET} NOTE: `operate` output is silenced.")

-    for prompt, guideline in test_cases.items():
-        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{prompt}'")
+    for objective, guideline in test_cases.items():
+        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

-        result = run_test_case(prompt, guideline)
+        result = run_test_case(objective, guideline)
         if result:
-            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{prompt}'")
+            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
         else:
-            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{prompt}'")
+            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")


 if __name__ == "__main__":
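Assuming the `operate` CLI is on PATH and an OpenAI key is available (the script imports load_dotenv, so a .env file with OPENAI_API_KEY is the likely source), running `python evaluate.py` should print one [EVALUATING] line per objective followed by [PASSED] or [FAILED], along the lines of:

    [STARTING EVALUATION] NOTE: `operate` output is silenced.
    [EVALUATING] 'Go to Google.com'
    [PASSED] 'Go to Google.com'
    [EVALUATING] 'Open YouTube and play holiday music'
    [FAILED] 'Open YouTube and play holiday music'
    ...

The pass/fail outcomes shown here are placeholders; actual results depend on the model's judgment of each screenshot.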
