Commit 519197e

swap out for gpt-4o

1 parent f1f0a4b commit 519197e

File tree

3 files changed: +64 -53 lines changed

README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -12,7 +12,7 @@
 </div>
 
 <!--
-:rotating_light: **OUTAGE NOTIFICATION: gpt-4-vision-preview**
+:rotating_light: **OUTAGE NOTIFICATION: gpt-4o**
 **This model is currently experiencing an outage so the self-operating computer may not work as expected.**
 -->
 
@@ -176,5 +176,5 @@ Stay updated with the latest developments:
 - This project is compatible with Mac OS, Windows, and Linux (with X server installed).
 
 ## OpenAI Rate Limiting Note
-The ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
+The ```gpt-4o``` model is required. To unlock access to this model, your account needs to spend at least \$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \$5.
 Learn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**
````
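The same substitution repeats in both Python files below: every `chat.completions.create` call pinned to `gpt-4-vision-preview` now pins `gpt-4o`, with no change to the request shape, since `gpt-4o` accepts the same `image_url` content parts. As a minimal standalone sketch of that call shape, assuming `OPENAI_API_KEY` is set in the environment and using a hypothetical screenshot path:

```python
import base64
import os

import openai

# Hypothetical path for illustration; the repo reads screenshots/screenshot.png.
with open(os.path.join("screenshots", "screenshot.png"), "rb") as img_file:
    img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

client = openai.OpenAI()  # picks up OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-4o",  # previously "gpt-4-vision-preview"
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this screenshot."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)
```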

evaluate.py

Lines changed: 42 additions & 31 deletions
```diff
@@ -25,7 +25,8 @@
 Guideline: {guideline}
 """
 
-SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')
+SCREENSHOT_PATH = os.path.join("screenshots", "screenshot.png")
+
 
 # Check if on a windows terminal that supports ANSI escape codes
 def supports_ansi():
@@ -37,6 +38,7 @@ def supports_ansi():
     is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
     return supported_platform and is_a_tty
 
+
 if supports_ansi():
     # Standard green text
     ANSI_GREEN = "\033[32m"
@@ -62,8 +64,8 @@ def supports_ansi():
     ANSI_YELLOW = ""
     ANSI_RED = ""
     ANSI_BRIGHT_MAGENTA = ""
-
-
+
+
 def format_evaluation_prompt(guideline):
     prompt = EVALUATION_PROMPT.format(guideline=guideline)
     return prompt
@@ -72,33 +74,37 @@ def format_evaluation_prompt(guideline):
 def parse_eval_content(content):
     try:
         res = json.loads(content)
-
+
         print(res["reason"])
-
+
         return res["guideline_met"]
     except:
-        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
+        print(
+            "The model gave a bad evaluation response and it couldn't be parsed. Exiting..."
+        )
         exit(1)
 
 
 def evaluate_final_screenshot(guideline):
-    '''Load the final screenshot and return True or False if it meets the given guideline.'''
+    """Load the final screenshot and return True or False if it meets the given guideline."""
     with open(SCREENSHOT_PATH, "rb") as img_file:
         img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
 
-    eval_message = [{
-        "role": "user",
-        "content": [
-            {"type": "text", "text": format_evaluation_prompt(guideline)},
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
-            },
-        ],
-    }]
-
+    eval_message = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": format_evaluation_prompt(guideline)},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+    ]
+
     response = openai.chat.completions.create(
-        model="gpt-4-vision-preview",
+        model="gpt-4o",
         messages=eval_message,
         presence_penalty=1,
         frequency_penalty=1,
@@ -107,53 +113,57 @@ def evaluate_final_screenshot(guideline):
     )
 
     eval_content = response.choices[0].message.content
-
+
     return parse_eval_content(eval_content)
 
 
 def run_test_case(objective, guideline, model):
-    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
+    """Returns True if the result of the test with the given prompt meets the given guideline for the given model."""
     # Run `operate` with the model to evaluate and the test case prompt
-    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
-
+    subprocess.run(
+        ["operate", "-m", model, "--prompt", f'"{objective}"'],
+        stdout=subprocess.DEVNULL,
+    )
+
     try:
         result = evaluate_final_screenshot(guideline)
-    except(OSError):
+    except OSError:
        print("[Error] Couldn't open the screenshot for evaluation")
         return False
-
+
     return result
 
 
 def get_test_model():
     parser = argparse.ArgumentParser(
         description="Run the self-operating-computer with a specified model."
     )
-
+
     parser.add_argument(
         "-m",
         "--model",
         help="Specify the model to evaluate.",
         required=False,
         default="gpt-4-with-ocr",
     )
-
+
     return parser.parse_args().model
 
 
 def main():
     load_dotenv()
     openai.api_key = os.getenv("OPENAI_API_KEY")
-
+
     model = get_test_model()
-
+
     print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
     print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")
 
-    passed = 0; failed = 0
+    passed = 0
+    failed = 0
     for objective, guideline in TEST_CASES.items():
         print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
-
+
         result = run_test_case(objective, guideline, model)
         if result:
             print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
@@ -166,5 +176,6 @@ def main():
         f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
     )
 
+
 if __name__ == "__main__":
     main()
```
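Aside from the model swap, the `evaluate.py` changes are Black-style reformatting plus one idiom fix: `except(OSError):` becomes `except OSError:` (both are valid Python, but the parenthesized form reads like a tuple of exception types and is non-idiomatic). The JSON contract that `parse_eval_content` enforces is worth spelling out; a minimal sketch, with a hypothetical model response standing in for the real API output:

```python
import json

# Hypothetical evaluation response in the shape parse_eval_content expects:
# a JSON object with a human-readable "reason" and a boolean "guideline_met".
content = '{"reason": "The browser shows the requested page.", "guideline_met": true}'

res = json.loads(content)
print(res["reason"])                  # printed for the test log
assert res["guideline_met"] is True   # drives the PASSED/FAILED verdict
```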

operate/models/apis.py

Lines changed: 20 additions & 20 deletions
```diff
@@ -41,12 +41,12 @@ async def get_next_action(model, messages, objective, session_id):
     print("[Self-Operating Computer][get_next_action]")
     print("[Self-Operating Computer][get_next_action] model", model)
     if model == "gpt-4":
-        return call_gpt_4_vision_preview(messages), None
+        return call_gpt_4o(messages), None
     if model == "gpt-4-with-som":
-        operation = await call_gpt_4_vision_preview_labeled(messages, objective, model)
+        operation = await call_gpt_4o_labeled(messages, objective, model)
         return operation, None
     if model == "gpt-4-with-ocr":
-        operation = await call_gpt_4_vision_preview_ocr(messages, objective, model)
+        operation = await call_gpt_4o_with_ocr(messages, objective, model)
         return operation, None
     if model == "agent-1":
         return "coming soon"
@@ -61,7 +61,7 @@ async def get_next_action(model, messages, objective, session_id):
     raise ModelNotRecognizedException(model)
 
 
-def call_gpt_4_vision_preview(messages):
+def call_gpt_4o(messages):
     if config.verbose:
         print("[call_gpt_4_v]")
     time.sleep(1)
@@ -102,7 +102,7 @@ def call_gpt_4_vision_preview(messages):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             presence_penalty=1,
             frequency_penalty=1,
@@ -137,7 +137,7 @@ def call_gpt_4_vision_preview(messages):
         )
         if config.verbose:
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
 def call_gemini_pro_vision(messages, objective):
@@ -189,12 +189,12 @@ def call_gemini_pro_vision(messages, objective):
         if config.verbose:
             print("[Self-Operating Computer][Operate] error", e)
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
-async def call_gpt_4_vision_preview_ocr(messages, objective, model):
+async def call_gpt_4o_with_ocr(messages, objective, model):
     if config.verbose:
-        print("[call_gpt_4_vision_preview_ocr]")
+        print("[call_gpt_4o_with_ocr]")
 
     # Construct the path to the file within the package
     try:
@@ -231,7 +231,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             temperature=0.7,
             max_tokens=3000,
@@ -253,7 +253,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
                 text_to_click = operation.get("text")
                 if config.verbose:
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] text_to_click",
+                        "[call_gpt_4o_with_ocr][click] text_to_click",
                         text_to_click,
                     )
                 # Initialize EasyOCR Reader
@@ -275,15 +275,15 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
 
                 if config.verbose:
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] text_element_index",
+                        "[call_gpt_4o_with_ocr][click] text_element_index",
                         text_element_index,
                     )
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] coordinates",
+                        "[call_gpt_4o_with_ocr][click] coordinates",
                         coordinates,
                     )
                     print(
-                        "[call_gpt_4_vision_preview_ocr][click] final operation",
+                        "[call_gpt_4o_with_ocr][click] final operation",
                         operation,
                     )
                 processed_content.append(operation)
@@ -307,7 +307,7 @@ async def call_gpt_4_vision_preview_ocr(messages, objective, model):
         return gpt_4_fallback(messages, objective, model)
 
 
-async def call_gpt_4_vision_preview_labeled(messages, objective, model):
+async def call_gpt_4o_labeled(messages, objective, model):
     time.sleep(1)
 
     try:
@@ -355,7 +355,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
         messages.append(vision_message)
 
         response = client.chat.completions.create(
-            model="gpt-4-vision-preview",
+            model="gpt-4o",
             messages=messages,
             presence_penalty=1,
             frequency_penalty=1,
@@ -415,7 +415,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] Failed to get click position in percent. Trying another method {ANSI_RESET}"
             )
-            return call_gpt_4_vision_preview(messages)
+            return call_gpt_4o(messages)
 
         x_percent = f"{click_position_percent[0]:.2f}"
         y_percent = f"{click_position_percent[1]:.2f}"
@@ -450,7 +450,7 @@ async def call_gpt_4_vision_preview_labeled(messages, objective, model):
         if config.verbose:
             print("[Self-Operating Computer][Operate] error", e)
             traceback.print_exc()
-        return call_gpt_4_vision_preview(messages)
+        return call_gpt_4o(messages)
 
 
 def call_ollama_llava(messages):
@@ -742,7 +742,7 @@ def get_last_assistant_message(messages):
 def gpt_4_fallback(messages, objective, model):
     if config.verbose:
         print("[gpt_4_fallback]")
-    system_prompt = get_system_prompt("gpt-4-vision-preview", objective)
+    system_prompt = get_system_prompt("gpt-4o", objective)
     new_system_message = {"role": "system", "content": system_prompt}
     # remove and replace the first message in `messages` with `new_system_message`
 
@@ -752,7 +752,7 @@ def gpt_4_fallback(messages, objective, model):
         print("[gpt_4_fallback][updated]")
         print("[gpt_4_fallback][updated] len(messages)", len(messages))
 
-    return call_gpt_4_vision_preview(messages)
+    return call_gpt_4o(messages)
 
 
 def confirm_system_prompt(messages, objective, model):
```
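In `apis.py` the edit is a uniform rename, `call_gpt_4_vision_preview*` to `call_gpt_4o*`, plus the model-string swap inside each `chat.completions.create` call and in `gpt_4_fallback`. The user-facing model names (`gpt-4`, `gpt-4-with-som`, `gpt-4-with-ocr`) are untouched, so existing invocations keep working; only the functions they dispatch to change. A condensed sketch of that dispatch after this commit, with hypothetical stubs standing in for the real call functions:

```python
# Hypothetical stubs; the real implementations live in operate/models/apis.py.
def call_gpt_4o(messages):
    ...

async def call_gpt_4o_labeled(messages, objective, model):
    ...

async def call_gpt_4o_with_ocr(messages, objective, model):
    ...

async def get_next_action(model, messages, objective, session_id):
    # CLI-facing model names are unchanged; each branch now targets gpt-4o.
    if model == "gpt-4":
        return call_gpt_4o(messages), None
    if model == "gpt-4-with-som":
        return await call_gpt_4o_labeled(messages, objective, model), None
    if model == "gpt-4-with-ocr":
        return await call_gpt_4o_with_ocr(messages, objective, model), None
    raise ValueError(f"model not recognized: {model}")  # stand-in for ModelNotRecognizedException
```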
