22import os
33import subprocess
44import platform
5+ import base64
56import openai
67
78from dotenv import load_dotenv
89
# File that evaluate_summary_screenshot() opens and sends to the model.
# NOTE(review): presumably written by the `operate` run — confirm the producer.
SUMMARY_SCREENSHOT_PATH = os.path.join("screenshots", "summary_screenshot.png")

# Prompt template for the evaluator model; `{guideline}` is substituted through
# str.format(). The model is told to answer with exactly TRUE or FALSE.
EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
Respond TRUE or FALSE based on whether or not the given guideline is met.

Guideline: {guideline}
"""
19+
920# Check if on a windows terminal that supports ANSI escape codes
1021def supports_ansi ():
1122 """
@@ -41,14 +52,65 @@ def supports_ansi():
4152 ANSI_YELLOW = ""
4253 ANSI_RED = ""
4354 ANSI_BRIGHT_MAGENTA = ""
55+
56+
def format_evaluation_prompt(guideline):
    """Return EVALUATION_PROMPT with its `{guideline}` placeholder filled in."""
    return EVALUATION_PROMPT.format(guideline=guideline)
60+
61+
def parse_eval_content(content):
    """Convert the evaluator model's textual verdict into a boolean.

    The verdict is normalized before comparison: surrounding whitespace,
    quotes, backticks and periods are removed and case is ignored, so
    replies such as " TRUE\\n", "'FALSE'." or "true" are still accepted.

    Args:
        content: The message content returned by the model. May be None.

    Returns:
        True for a TRUE verdict, False for a FALSE verdict.

    Raises:
        SystemExit: With exit code 1, after printing a diagnostic, when the
            content is missing or is not a recognizable verdict.
    """
    # A missing or non-text reply is treated as unparsable rather than
    # failing with an AttributeError.
    text = content if isinstance(content, str) else ""
    verdict = text.strip().strip("'\"`.").upper()

    if verdict == "TRUE":
        return True
    if verdict == "FALSE":
        return False

    print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
    # Raise SystemExit directly: the bare `exit` helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)
4470
4571
46- def run_test_case (prompt , guideline ):
def evaluate_summary_screenshot(guideline):
    """Ask the vision model whether the summary screenshot meets a guideline.

    The file at SUMMARY_SCREENSHOT_PATH is read, base64-encoded and sent to
    the model as a data URL together with the formatted evaluation prompt.

    Args:
        guideline: Text describing what the screenshot is expected to show.

    Returns:
        True if the model answers TRUE, False if it answers FALSE.

    Raises:
        OSError: If the screenshot file cannot be opened or read.
        SystemExit: Raised by parse_eval_content() when the model's reply
            is not a recognizable verdict.
    """
    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                # The screenshot is a .png file, so label the payload as PNG
                # rather than JPEG.
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        # This is a binary pass/fail judgement: sample as deterministically
        # as possible so repeated runs of the same test case agree.
        temperature=0,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content

    return parse_eval_content(eval_content)
100+
101+
def run_test_case(objective, guideline):
    """Run `operate` on an objective and judge the resulting summary screenshot.

    Args:
        objective: The objective passed to `operate` via `--prompt`.
        guideline: The condition the summary screenshot must satisfy.

    Returns:
        The verdict from evaluate_summary_screenshot(), or False when the
        evaluation step raises OSError (e.g. the screenshot cannot be opened).
    """
    # Run `operate` with the test case objective. Arguments given as a list
    # are passed to the program verbatim (no shell parsing), so the objective
    # must not be wrapped in literal quote characters.
    subprocess.run(['operate', '--prompt', objective], stdout=subprocess.DEVNULL)

    try:
        result = evaluate_summary_screenshot(guideline)
    except OSError:
        print("Couldn't open the summary screenshot")
        return False

    return result
52114
53115
54116def main ():
@@ -57,20 +119,21 @@ def main():
57119
58120 # Define the test cases and the guidelines
59121 test_cases = {
122+ "Go to Google.com" : "The Google home page is visible with the search bar." ,
60123 "Open YouTube and play holiday music" : "The YouTube video player is loaded and actively playing holiday music." ,
61124 "Open Google Docs and write a poem" : "A Google Doc file is opened in the browser with a poem typed into it." ,
62125 }
63126
64127 print (f"{ ANSI_BRIGHT_MAGENTA } [STARTING EVALUATION]{ ANSI_RESET } NOTE: `operate` output is silenced." )
65128
66- for prompt , guideline in test_cases .items ():
67- print (f"{ ANSI_BLUE } [EVALUATING]{ ANSI_RESET } '{ prompt } '" )
129+ for objective , guideline in test_cases .items ():
130+ print (f"{ ANSI_BLUE } [EVALUATING]{ ANSI_RESET } '{ objective } '" )
68131
69- result = run_test_case (prompt , guideline )
132+ result = run_test_case (objective , guideline )
70133 if result :
71- print (f"{ ANSI_GREEN } [PASSED]{ ANSI_RESET } '{ prompt } '" )
134+ print (f"{ ANSI_GREEN } [PASSED]{ ANSI_RESET } '{ objective } '" )
72135 else :
73- print (f"{ ANSI_RED } [FAILED]{ ANSI_RESET } '{ prompt } '" )
136+ print (f"{ ANSI_RED } [FAILED]{ ANSI_RESET } '{ objective } '" )
74137
75138
76139if __name__ == "__main__" :
0 commit comments