1 file changed, +14 -7 lines changed
@@ -3,6 +3,7 @@
 import subprocess
 import platform
 import base64
+import json
 import openai

 from dotenv import load_dotenv
@@ -14,8 +15,10 @@

 EVALUATION_PROMPT = """
 Your job is to look at the given screenshot and determine if the following guideline is met in the image.
-You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
-Respond TRUE or FALSE based on whether or not the given guideline is met.
+You must respond in the following format ONLY. Do not add anything else:
+{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
+guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
+reason must be a string containing a justification for your decision.

 Guideline: {guideline}
 """
@@ -65,11 +68,13 @@ def format_evaluation_prompt(guideline):


 def parse_eval_content(content):
-    if content == "TRUE":
-        return True
-    elif content == "FALSE":
-        return False
-    else:
+    try:
+        res = json.loads(content)
+
+        print(res["reason"])
+
+        return res["guideline_met"]
+    except:
         print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
         exit(1)

@@ -131,8 +136,10 @@ def main():
         result = run_test_case(objective, guideline)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
+            passed += 1
         else:
             print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
+            failed += 1

     print(
         f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"