22import json
33import logging
44import os
5+ import time
56from copy import deepcopy
67from dataclasses import dataclass
78from io import BytesIO
@@ -340,7 +341,8 @@ def __init__(
340341 require_terminal : bool ,
341342 os_type : str ,
342343 enable_proxy : bool ,
343- max_steps : int = 50 ,
344+ max_steps : int ,
345+ exp_dir : Path ,
344346 ):
345347 self .task = task
346348 self .env_info = {
@@ -372,10 +374,15 @@ def __init__(
372374 )
373375 self ._step_count = 0
374376 self .max_steps = max_steps
377+ self .exp_dir = exp_dir
375378
376379 def reset (self , seed : int | None = None ) -> tuple [dict [str , Any ], dict [str , Any ]]:
377- raw_obs = self .env .reset (task_config = self .task , seed = seed )
378- obs = self .env_to_agentlab_observation (raw_obs )
380+ self .env .reset (task_config = self .task , seed = seed )
381+ logging .info (f"Start solving task: { self .task ['instruction' ]} " )
382+ time .sleep (60 ) # Wait for the environment to be ready, as in https://github.com/xlang-ai/OSWorld/blob/main/lib_run_single.py#L15
383+ raw_obs = self .env ._get_obs () # Get the initial observation
384+ self .env .controller .start_recording ()
385+ obs = self .to_agentlab_observation (raw_obs )
379386 self ._step_count = 0
380387 return obs , self .env_info
381388
@@ -385,7 +392,7 @@ def step(self, action: str):
385392 env_action = self .agentlab_to_env_action (action )
386393 logger .info (f"AgentLab Action returned: { action } , converted to: { env_action } " )
387394 raw_obs , reward , done , info = self .env .step (env_action )
388- logger .info (f"Task { self .task ['id' ]} Step { self ._step_count + 1 } /{ self .max_steps } done " )
395+ logger .info (f"STEP { self .task ['id' ]} { self ._step_count + 1 } /{ self .max_steps } " )
389396 self ._step_count += 1
390397 truncated = info .get ("fail" , False ) or self ._step_count >= self .max_steps
391398 if done or truncated :
@@ -398,7 +405,7 @@ def step(self, action: str):
398405 logger .info (f"Evaluated reward: { reward } " )
399406 except Exception as e :
400407 logger .error (f"Failed to evaluate { self .task } task: { e } " )
401- obs = self .env_to_agentlab_observation (raw_obs )
408+ obs = self .to_agentlab_observation (raw_obs )
402409 return obs , reward , done , truncated , info
403410
404411 def agentlab_to_env_action (self , action : str ) -> Any :
@@ -410,7 +417,7 @@ def agentlab_to_env_action(self, action: str) -> Any:
410417 "PyAutoGUI action space is not supported yet. Please use 'computer_13' action space."
411418 )
412419
413- def env_to_agentlab_observation (self , obs : dict [str , Any ]) -> dict [str , Any ]:
420+ def to_agentlab_observation (self , obs : dict [str , Any ]) -> dict [str , Any ]:
414421 """Convert OSWorld observation to AgentLab format."""
415422 converted_obs = {}
416423
@@ -467,7 +474,7 @@ def convert_agentlab_action_to_computer_13(self, action: str) -> dict[str, Any]
467474 ... snapshot_name="init_state", action_space="computer_13",
468475 ... cache_dir="cache", screen_size=(1920, 1080), headless=True,
469476 ... require_a11y_tree=True, require_terminal=False, os_type="Ubuntu",
470- ... enable_proxy=False, max_steps=50)
477+ ... enable_proxy=False, max_steps=50, exp_dir=Path(".") )
471478 >>> env.convert_agentlab_action_to_computer_13("move_to(x=100, y=200)")
472479 {'action_type': 'MOVE_TO', 'parameters': {'x': 100, 'y': 200}}
473480 >>> env.convert_agentlab_action_to_computer_13("wait()")
@@ -513,6 +520,9 @@ def parse_agentlab_action_str_to_func_args(action: str):
513520 return None , None , None
514521
515522 def close (self ):
523+ video_name = str (self .exp_dir / "recording.mp4" )
524+ self .env .controller .end_recording (video_name )
525+ logger .info (f"Recorded video saved to { video_name } " )
516526 return self .env .close ()
517527
518528
@@ -614,6 +624,7 @@ def make_env(
614624 os_type = self .os_type ,
615625 enable_proxy = self .enable_proxy ,
616626 max_steps = self .max_steps ,
627+ exp_dir = exp_dir ,
617628 )
618629 return gym
619630
0 commit comments