@@ -109,51 +109,33 @@ def env_to_agentlab_observation(self, obs: dict[str, Any]) -> dict[str, Any]:
109109 """Convert OSWorld observation to AgentLab format."""
110110 converted_obs = {}
111111
112- # Core visual and interaction components
113112 self ._add_screenshot (converted_obs , obs )
114- # TODO: Check if the unprocessesed ax_tree is a suitable representation for agentlab agents or use the utility functions from os-world agents to convert them.
115- # TODO: Check if there is something equivalent to bid in OSWorld Axtree. and how it is used in the action space. This can be used with GenericAgent.
116- converted_obs ["axtree_object" ] = obs ["accessibility_tree" ]
113+ # self._add_som_screenshot(converted_obs, obs) #TODO: test this
114+ converted_obs ["axtree_txt" ] = linearize_accessibility_tree (
115+ accessibility_tree = obs ["accessibility_tree" ], platform = "ubuntu"
116+ )
117117 converted_obs ["last_action_error" ] = "" # OSWorld doesn't provide this directly
118118 converted_obs ["focused_element_bid" ] = "" # Extract from accessibility tree if available
119- # Browser-like context (adapted for desktop environment)
120119 converted_obs = self ._add_browser_context (converted_obs )
121- # Task and instruction context
122120 converted_obs = self ._add_task_context (converted_obs , obs )
123121
124122 return converted_obs
125123
124+ def convert_screenshot_to_numpy (self , screenshot ) -> np .ndarray :
125+ """Convert screenshot to numpy array format expected by AgentLab."""
126+ image = Image .open (BytesIO (screenshot ))
127+ image = image .convert ("RGB" ) if image .mode != "RGB" else image
128+ return np .array (image )
129+
126130 def _add_screenshot (self , converted_obs : dict [str , Any ], obs : dict [str , Any ]) -> None :
127131 """Convert screenshot to numpy array format expected by AgentLab"""
128- if "screenshot" not in obs :
129- return
130-
131- screenshot = obs ["screenshot" ]
132-
133- try :
134- from io import BytesIO
135-
136- import numpy as np
137- from PIL import Image
138-
139- if isinstance (screenshot , bytes ):
140- image = Image .open (BytesIO (screenshot ))
141- elif hasattr (screenshot , "convert" ): # PIL Image
142- image = screenshot
143- elif hasattr (screenshot , "__array__" ): # numpy array
144- converted_obs ["screenshot" ] = np .array (screenshot )
145- return
146- else :
147- raise ValueError (f"Unexpected screenshot type: { type (screenshot )} " )
148-
149- # Convert PIL image to RGB numpy array
150- if image .mode != "RGB" :
151- image = image .convert ("RGB" )
152- converted_obs ["screenshot" ] = np .array (image )
132+ converted_obs ["screenshot" ] = self .convert_screenshot_to_numpy (obs ["screenshot" ])
153133
154- except Exception as e :
155- logger .warning (f"Failed to process screenshot: { e } " )
156- converted_obs ["screenshot" ] = None
134+ def _add_som_screenshot (self , converted_obs : dict [str , Any ], obs : dict [str , Any ]) -> None :
135+ """Convert SOM screenshot to numpy array format expected by AgentLab"""
136+ masks , drew_nodes , tagged_screenshot , linearized_accessibility_tree = tag_screenshot (
137+ obs ["screenshot" ], obs ["accessibility_tree" ], platform = "ubuntu" )
138+ converted_obs ["som_screenshot" ] = self .convert_screenshot_to_numpy (tagged_screenshot )
157139
158140 def _add_browser_context (self , converted_obs : dict [str , Any ]):
159141 """Add browser-like context fields adapted for desktop environment."""
@@ -167,7 +149,6 @@ def _add_task_context(self, converted_obs: dict[str, Any], obs: dict[str, Any]):
167149 """Add task and instruction context fields."""
168150 instruction = obs .get ("instruction" , "" )
169151 converted_obs ["goal_object" ] = [{"type" : "text" , "text" : instruction }]
170- # Terminal output (preserve if available)
171152 if obs .get ("terminal" ):
172153 converted_obs ["terminal_output" ] = obs ["terminal" ]
173154 return converted_obs
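
As context for reviewers, here is a minimal, self-contained sketch of what the new `convert_screenshot_to_numpy` helper does, assuming OSWorld delivers the screenshot as raw encoded image bytes (PNG). The standalone function name and the synthetic test payload are illustrative only, not part of this change:

```python
# Sketch only: mirrors convert_screenshot_to_numpy above, assuming
# obs["screenshot"] arrives as raw PNG bytes.
from io import BytesIO

import numpy as np
from PIL import Image


def screenshot_bytes_to_numpy(screenshot: bytes) -> np.ndarray:
    """Decode encoded image bytes into an (H, W, 3) uint8 RGB array."""
    image = Image.open(BytesIO(screenshot))
    if image.mode != "RGB":
        image = image.convert("RGB")
    return np.array(image)


if __name__ == "__main__":
    # Fake OSWorld payload: a 4x3 RGBA image encoded as PNG bytes.
    buf = BytesIO()
    Image.new("RGBA", (4, 3), (255, 0, 0, 255)).save(buf, format="PNG")
    arr = screenshot_bytes_to_numpy(buf.getvalue())
    print(arr.shape, arr.dtype)  # (3, 4, 3) uint8
```

In the diff above, both `_add_screenshot` and `_add_som_screenshot` route through this single decode path, so the byte-to-array handling lives in one place.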