Skip to content

Commit 8cbd372

Browse files
committed
Add evaluation justification
1 parent 138012a commit 8cbd372

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

evaluate.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import subprocess
44
import platform
55
import base64
6+
import json
67
import openai
78

89
from dotenv import load_dotenv
@@ -14,8 +15,10 @@
1415

1516
EVALUATION_PROMPT = """
1617
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
17-
You can only respond in one of two possible ways: 'TRUE' or 'FALSE' with those exact spellings.
18-
Respond TRUE or FALSE based on whether or not the given guideline is met.
18+
You must respond in the following format ONLY. Do not add anything else:
19+
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
20+
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
21+
reason must be a string containing a justification for your decision.
1922
2023
Guideline: {guideline}
2124
"""
@@ -65,11 +68,13 @@ def format_evaluation_prompt(guideline):
6568

6669

6770
def parse_eval_content(content):
68-
if content == "TRUE":
69-
return True
70-
elif content == "FALSE":
71-
return False
72-
else:
71+
try:
72+
res = json.loads(content)
73+
74+
print(res["reason"])
75+
76+
return res["guideline_met"]
77+
except:
7378
print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
7479
exit(1)
7580

@@ -131,8 +136,10 @@ def main():
131136
result = run_test_case(objective, guideline)
132137
if result:
133138
print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
139+
passed += 1
134140
else:
135141
print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
142+
failed += 1
136143

137144
print(
138145
f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"

0 commit comments

Comments
 (0)