1- from dataclasses import dataclass
21import logging
2+ import os
3+ from dataclasses import dataclass
34
5+ import openai
46from bgym import HighLevelActionSetArgs
57from browsergym .experiments import AbstractAgentArgs , Agent , AgentInfo
6- from agentlab .llm .llm_utils import image_to_jpg_base64_url
78
8- import openai
9+ from agentlab . llm . llm_utils import image_to_jpg_base64_url
910
10- client = openai .OpenAI ()
11+ client = openai .OpenAI (api_key = os . getenv ( "OPENAI_API_KEY" ) )
1112
1213
1314@dataclass
@@ -75,9 +76,10 @@ def __init__(
7576
7677 self .action_set = action_set .make_action_set ()
7778
78- assert not self .enable_safety_checks and (
79- self .action_set .demo_mode is not None or self .action_set .demo_mode != "off"
80- ), "Safety checks are enabled but no demo mode is set. Please set demo_mode to 'all_blue' or 'off'."
79+ assert not (
80+ self .enable_safety_checks
81+ and (self .action_set .demo_mode is None or self .action_set .demo_mode == "off" )
82+ ), "Safety checks are enabled but no demo mode is set. Please set demo_mode to 'all_blue'."
8183
8284 self .computer_calls = []
8385 self .pending_checks = []
@@ -118,15 +120,21 @@ def parse_action_to_bgym(self, action) -> str:
118120
119121 case "keypress" :
120122 keys = action .keys
123+ to_press = ""
121124 for k in keys :
122125 if k .lower () == "enter" :
123- return "keyboard_press(' Enter') "
126+ to_press = " Enter"
124127 elif k .lower () == "space" :
125- return "keyboard_press(' ') "
128+ to_press = " "
126129 elif k .lower () == "ctrl" :
127- return "keyboard_press('Ctrl')"
130+ to_press = "Ctrl"
131+ elif k .lower () == "shift" :
132+ to_press = "Shift"
133+ elif k .lower () == "alt" :
134+ to_press = "Alt"
128135 else :
129- return f"keyboard_press('{ k } ')"
136+ to_press += f"+{ k } "
137+ return f"keyboard_press('{ to_press } ')"
130138
131139 case "type" :
132140 text = action .text
@@ -150,7 +158,7 @@ def parse_action_to_bgym(self, action) -> str:
150158 return "noop()"
151159
152160 case _:
153- logging .error (f"No action found for { action_type } . Please check the action type." )
161+ logging .debug (f"No action found for { action_type } . Please check the action type." )
154162 return None
155163
156164 return action
@@ -206,7 +214,7 @@ def get_action(self, obs):
206214 screenshot_base64 = image_to_jpg_base64_url (obs ["screenshot" ])
207215
208216 if not self .initialized :
209- print ("Initializing OpenAI Computer Use Agent with goal:" , goal )
217+ logging . debug ("Initializing OpenAI Computer Use Agent with goal:" , goal )
210218 response = self .start_session (goal , screenshot_base64 )
211219 for item in response .output :
212220 if item .type == "reasoning" :
@@ -222,7 +230,6 @@ def get_action(self, obs):
222230 if not self .enable_safety_checks :
223231 # Bypass safety checks
224232 self .pending_checks = computer_call .pending_safety_checks
225- print (f"Pending safety checks: { self .pending_checks } " )
226233 action = self .parse_action_to_bgym (computer_call .action )
227234 self .last_call_id = computer_call .call_id
228235 return action , self .agent_info
@@ -245,7 +252,9 @@ def get_action(self, obs):
245252 self .inputs .append (self .answer_assistant )
246253 self .answer_assistant = None
247254
255+ self .agent_info .chat_messages = str (self .inputs )
248256 response = self .call_api (self .inputs , self .previous_response_id )
257+ self .inputs = [] # Clear inputs for the next call
249258 self .previous_response_id = response .id
250259
251260 self .computer_calls = [item for item in response .output if item .type == "computer_call" ]
0 commit comments