Skip to content

Commit 7f6b6c9

Browse files
Refactor observation conversion, add axtree and remove Todos.
1 parent 0dbb9dd commit 7f6b6c9

File tree

1 file changed

+16
-35
lines changed

1 file changed

+16
-35
lines changed

src/agentlab/benchmarks/osworld.py

Lines changed: 16 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -109,51 +109,33 @@ def env_to_agentlab_observation(self, obs: dict[str, Any]) -> dict[str, Any]:
109109
"""Convert OSWorld observation to AgentLab format."""
110110
converted_obs = {}
111111

112-
# Core visual and interaction components
113112
self._add_screenshot(converted_obs, obs)
114-
# TODO: Check if the unprocessesed ax_tree is a suitable representation for agentlab agents or use the utility functions from os-world agents to convert them.
115-
# TODO: Check if there is something equivalent to bid in OSWorld Axtree. and how it is used in the action space. This can be used with GenericAgent.
116-
converted_obs["axtree_object"] = obs["accessibility_tree"]
113+
# self._add_som_screenshot(converted_obs, obs) #TODO: test this
114+
converted_obs["axtree_txt"] = linearize_accessibility_tree(
115+
accessibility_tree=obs["accessibility_tree"], platform="ubuntu"
116+
)
117117
converted_obs["last_action_error"] = "" # OSWorld doesn't provide this directly
118118
converted_obs["focused_element_bid"] = "" # Extract from accessibility tree if available
119-
# Browser-like context (adapted for desktop environment)
120119
converted_obs = self._add_browser_context(converted_obs)
121-
# Task and instruction context
122120
converted_obs = self._add_task_context(converted_obs, obs)
123121

124122
return converted_obs
125123

124+
def convert_screenshot_to_numpy(self, screenshot) -> np.ndarray:
125+
"""Convert screenshot to numpy array format expected by AgentLab."""
126+
image = Image.open(BytesIO(screenshot))
127+
image = image.convert("RGB") if image.mode != "RGB" else image
128+
return np.array(image)
129+
126130
def _add_screenshot(self, converted_obs: dict[str, Any], obs: dict[str, Any]) -> None:
127131
"""Convert screenshot to numpy array format expected by AgentLab"""
128-
if "screenshot" not in obs:
129-
return
130-
131-
screenshot = obs["screenshot"]
132-
133-
try:
134-
from io import BytesIO
135-
136-
import numpy as np
137-
from PIL import Image
138-
139-
if isinstance(screenshot, bytes):
140-
image = Image.open(BytesIO(screenshot))
141-
elif hasattr(screenshot, "convert"): # PIL Image
142-
image = screenshot
143-
elif hasattr(screenshot, "__array__"): # numpy array
144-
converted_obs["screenshot"] = np.array(screenshot)
145-
return
146-
else:
147-
raise ValueError(f"Unexpected screenshot type: {type(screenshot)}")
148-
149-
# Convert PIL image to RGB numpy array
150-
if image.mode != "RGB":
151-
image = image.convert("RGB")
152-
converted_obs["screenshot"] = np.array(image)
132+
converted_obs["screenshot"] = self.convert_screenshot_to_numpy(obs["screenshot"])
153133

154-
except Exception as e:
155-
logger.warning(f"Failed to process screenshot: {e}")
156-
converted_obs["screenshot"] = None
134+
def _add_som_screenshot(self, converted_obs: dict[str, Any], obs: dict[str, Any]) -> None:
135+
"""Convert SOM screenshot to numpy array format expected by AgentLab"""
136+
masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(
137+
obs["screenshot"], obs["accessibility_tree"], platform="ubuntu")
138+
converted_obs["som_screenshot"] = self.convert_screenshot_to_numpy(tagged_screenshot)
157139

158140
def _add_browser_context(self, converted_obs: dict[str, Any]):
159141
"""Add browser-like context fields adapted for desktop environment."""
@@ -167,7 +149,6 @@ def _add_task_context(self, converted_obs: dict[str, Any], obs: dict[str, Any]):
167149
"""Add task and instruction context fields."""
168150
instruction = obs.get("instruction", "")
169151
converted_obs["goal_object"] = [{"type": "text", "text": instruction}]
170-
# Terminal output (preserve if available)
171152
if obs.get("terminal"):
172153
converted_obs["terminal_output"] = obs["terminal"]
173154
return converted_obs

0 commit comments

Comments
 (0)