Code formatting: normalize whitespace, trailing commas, and line wrapping in the OpenAI CUA agent

imenelydiaker · imenelydiaker · commit d827ef110681 · 2025-07-28T17:56:50.000+02:00
diff --git a/src/agentlab/agents/openai_cua/agent.py b/src/agentlab/agents/openai_cua/agent.py
@@ -6,6 +6,7 @@
 from agentlab.llm.llm_utils import image_to_jpg_base64_url
 
 import openai
+
 client = openai.OpenAI()
 
 
@@ -14,15 +15,16 @@ class OpenAIComputerUseAgentArgs(AbstractAgentArgs):
     """
     Arguments for the OpenAI Computer Use Agent.
     """
-    agent_name: str = None 
-    model: str = "computer-use-preview" 
+
+    agent_name: str = None
+    model: str = "computer-use-preview"
     tool_type: str = "computer_use_preview"
     display_width: int = 1024
     display_height: int = 768
     environment: str = "browser"
     reasoning_summary: str = "concise"
     truncation: str = "auto"  # Always set to "auto" for OpenAI API
-    action_set: HighLevelActionSetArgs = None 
+    action_set: HighLevelActionSetArgs = None
     enable_safety_checks: bool = False  # Optional, default to False, only use in demo mode
     implicit_agreement: bool = True  # Whether to require explicit agreement for actions or not
 
@@ -47,23 +49,24 @@ def make_agent(self):
             truncation=self.truncation,
             action_set=self.action_set,
             enable_safety_checks=self.enable_safety_checks,
-            implicit_agreement=self.implicit_agreement
+            implicit_agreement=self.implicit_agreement,
         )
 
 
 class OpenAIComputerUseAgent(Agent):
-    def __init__(self, 
-                 model: str, 
-                 tool_type: str,
-                 display_width: int, 
-                 display_height: int, 
-                 environment: str, 
-                 reasoning_summary: str, 
-                 truncation: str, 
-                 action_set: HighLevelActionSetArgs,
-                 enable_safety_checks: bool = False,
-                 implicit_agreement: bool = True
-                ):
+    def __init__(
+        self,
+        model: str,
+        tool_type: str,
+        display_width: int,
+        display_height: int,
+        environment: str,
+        reasoning_summary: str,
+        truncation: str,
+        action_set: HighLevelActionSetArgs,
+        enable_safety_checks: bool = False,
+        implicit_agreement: bool = True,
+    ):
         self.model = model
         self.reasoning_summary = reasoning_summary
         self.truncation = truncation
@@ -72,24 +75,24 @@ def __init__(self,
 
         self.action_set = action_set.make_action_set()
 
-        assert not self.enable_safety_checks and\
-              (self.action_set.demo_mode is not None or self.action_set.demo_mode != "off"), \
-                "Safety checks are enabled but no demo mode is set. Please set demo_mode to 'all_blue' or 'off'."
+        assert not self.enable_safety_checks and (
+            self.action_set.demo_mode is not None or self.action_set.demo_mode != "off"
+        ), "Safety checks are enabled but no demo mode is set. Please set demo_mode to 'all_blue' or 'off'."
 
         self.computer_calls = []
         self.pending_checks = []
-        self.previous_response_id = None 
-        self.last_call_id = None  
+        self.previous_response_id = None
+        self.last_call_id = None
         self.initialized = False  # Set to True to call the API on the first get_action
-        self.answer_assistant = None # Store the user answer to send to the assistant
+        self.answer_assistant = None  # Store the user answer to send to the assistant
         self.agent_info = AgentInfo()
 
         self.tools = [
             {
                 "type": tool_type,
                 "display_width": display_width,
                 "display_height": display_height,
-                "environment": environment
+                "environment": environment,
             }
         ]
         self.inputs = []
@@ -100,7 +103,7 @@ def parse_action_to_bgym(self, action) -> str:
         """
         action_type = action.type
 
-        match(action_type):
+        match (action_type):
             case "click":
                 x, y = action.x, action.y
                 button = action.button
@@ -124,11 +127,11 @@ def parse_action_to_bgym(self, action) -> str:
                         return "keyboard_press('Ctrl')"
                     else:
                         return f"keyboard_press('{k}')"
-            
+
             case "type":
                 text = action.text
                 return f"keyboard_insert_text('{text}')"
-            
+
             case "drag":
                 from_x, from_y = action.path[0].x, action.path[0].y
                 to_x, to_y = action.path[-1].x, action.path[-1].y
@@ -139,7 +142,7 @@ def parse_action_to_bgym(self, action) -> str:
                 return f"mouse_move({x}, {y})"
 
             case "wait":
-                return "noop(2000)" # wait for 2 seconds
+                return "noop(2000)"  # wait for 2 seconds
 
             # The screenshot is already given in the observation, so we don't need to handle it here.
             case "screenshot":
@@ -149,7 +152,7 @@ def parse_action_to_bgym(self, action) -> str:
             case _:
                 logging.error(f"No action found for {action_type}. Please check the action type.")
                 return None
-            
+
         return action
 
     def start_session(self, goal: str, screenshot_base64: str):
@@ -174,17 +177,11 @@ def start_session(self, goal: str, screenshot_base64: str):
         response = self.call_api(
             input=[
                 {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "input_text",
-                        "text": instruction
-                    },
-                    {
-                        "type": "input_image",
-                        "image_url": f"{screenshot_base64}"
-                    }
-                ]
+                    "role": "user",
+                    "content": [
+                        {"type": "input_text", "text": instruction},
+                        {"type": "input_image", "image_url": f"{screenshot_base64}"},
+                    ],
                 }
             ],
             reasoning={
@@ -199,8 +196,8 @@ def call_api(self, input: list, previous_response_id=None, **kwargs):
             previous_response_id=previous_response_id,
             tools=self.tools,
             input=input,
-            truncation=self.truncation, # Always set to "auto"
-            **kwargs
+            truncation=self.truncation,  # Always set to "auto"
+            **kwargs,
         )
         return response
 
@@ -218,7 +215,7 @@ def get_action(self, obs):
                     self.computer_calls.append(item)
             self.previous_response_id = response.id
             self.initialized = True
-            
+
         if len(self.computer_calls) > 0:
             logging.debug("Found multiple computer calls in previous call. Processing them...")
             computer_call = self.computer_calls.pop(0)
@@ -237,18 +234,17 @@ def get_action(self, obs):
                     "call_id": self.last_call_id,
                     "type": "computer_call_output",
                     "acknowledged_safety_checks": self.pending_checks,
-                    "output":
-                        {
-                            "type": "input_image",
-                            "image_url": f"{screenshot_base64}" # current screenshot
-                        },
+                    "output": {
+                        "type": "input_image",
+                        "image_url": f"{screenshot_base64}",  # current screenshot
+                    },
                 }
             )
 
             if self.answer_assistant:
                 self.inputs.append(self.answer_assistant)
                 self.answer_assistant = None
-            
+
             response = self.call_api(self.inputs, self.previous_response_id)
             self.previous_response_id = response.id
 
@@ -263,17 +259,12 @@ def get_action(self, obs):
                         # Always answer with: Yes, continue.
                         self.answer_assistant = {
                             "role": "user",
-                            "content": [
-                                {
-                                    "type": "input_text",
-                                    "text": "Yes, continue."
-                                }
-                            ]
+                            "content": [{"type": "input_text", "text": "Yes, continue."}],
                         }
-                        return f"send_msg_to_user(\'{item.content[0].text}\')", self.agent_info
+                        return f"send_msg_to_user('{item.content[0].text}')", self.agent_info
                 logging.debug("No action found in the response. Returning None.")
                 return None, self.agent_info
-            
+
             computer_call = self.computer_calls.pop(0)
             self.last_call_id = computer_call.call_id
             action = self.parse_action_to_bgym(computer_call.action)
@@ -293,4 +284,4 @@ def get_action(self, obs):
                     self.agent_info.think = item.summary[0].text if item.summary else None
                     break
 
-            return action, self.agent_info
+            return action, self.agent_info
diff --git a/src/agentlab/agents/openai_cua/agent_configs.py b/src/agentlab/agents/openai_cua/agent_configs.py
@@ -14,6 +14,6 @@
         subsets=("chat", "coord"),
         demo_mode=None,
     ),
-    enable_safety_checks=False, 
-    implicit_agreement=True
+    enable_safety_checks=False,
+    implicit_agreement=True,
 )

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,6 @@`
`14`	`14`	`subsets=("chat", "coord"),`
`15`	`15`	`demo_mode=None,`
`16`	`16`	`),`
`17`		`- enable_safety_checks=False,`
`18`		`- implicit_agreement=True`
	`17`	`+ enable_safety_checks=False,`
	`18`	`+ implicit_agreement=True,`
`19`	`19`	`)`