2424Guideline: {guideline}
2525"""
2626
27- SUMMARY_SCREENSHOT_PATH = os .path .join ('screenshots' , 'summary_screenshot .png' )
27+ SCREENSHOT_PATH = os .path .join ('screenshots' , 'screenshot .png' )
2828
2929# Check if on a Windows terminal that supports ANSI escape codes
3030def supports_ansi ():
@@ -80,9 +80,9 @@ def parse_eval_content(content):
8080 exit (1 )
8181
8282
83- def evaluate_summary_screenshot (guideline ):
84- '''Load the summary screenshot and return True or False if it meets the given guideline.'''
85- with open (SUMMARY_SCREENSHOT_PATH , "rb" ) as img_file :
83+ def evaluate_final_screenshot (guideline ):
84+ '''Load the final screenshot and return True or False if it meets the given guideline.'''
85+ with open (SCREENSHOT_PATH , "rb" ) as img_file :
8686 img_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
8787
8888 eval_message = [{
@@ -116,9 +116,9 @@ def run_test_case(objective, guideline):
116116 subprocess .run (['operate' , '--prompt' , f'"{ objective } "' ], stdout = subprocess .DEVNULL )
117117
118118 try :
119- result = evaluate_summary_screenshot (guideline )
119+ result = evaluate_final_screenshot (guideline )
120120 except (OSError ):
121- print ("Couldn't open the summary screenshot" )
121+ print ("[Error] Couldn't open the screenshot for evaluation " )
122122 return False
123123
124124 return result
@@ -143,7 +143,7 @@ def main():
143143 failed += 1
144144
145145 print (
146- f"{ ANSI_BRIGHT_MAGENTA } [EVALUATION COMPLETE]{ ANSI_RESET } { passed } tests passed, { failed } tests failed"
146+ f"{ ANSI_BRIGHT_MAGENTA } [EVALUATION COMPLETE]{ ANSI_RESET } { passed } test { '' if passed == 1 else 's' } passed , { failed } test { '' if failed == 1 else 's' } failed"
147147 )
148148
149149if __name__ == "__main__" :
0 commit comments