Skip to content

Commit ce9dfaf

Browse files
committed
first test for saving study differently still need to make it work
1 parent 8dc809c commit ce9dfaf

File tree

5 files changed

+201
-43
lines changed

5 files changed

+201
-43
lines changed

src/agentlab/experiments/exp_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77
from time import sleep, time
88

9-
from browsergym.experiments.loop import ExpArgs, _move_old_exp, yield_all_exp_results
9+
from browsergym.experiments.loop import ExpArgs, yield_all_exp_results
1010
from tqdm import tqdm
1111

1212
logger = logging.getLogger(__name__) # Get logger based on module name
@@ -25,6 +25,12 @@
2525
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
2626

2727

28+
def move_old_exp(exp_dir):
    """Hide an old experiment directory by renaming it with a leading underscore.

    The directory ``<parent>/<name>`` is renamed to ``<parent>/_<name>``. If
    that name is already taken (e.g. the same directory was hidden before),
    further underscores are prepended until an unused name is found, so a
    previously hidden directory is never overwritten and the rename does not
    fail on an existing target.

    Args:
        exp_dir: Path (``str`` or ``Path``) of the experiment directory.
            Nothing happens if it does not exist.
    """
    exp_dir = Path(exp_dir)
    if not exp_dir.exists():
        return

    # Path.rename onto an existing target either raises (Windows, non-empty
    # directory) or silently replaces it, so look for a free name first.
    target = exp_dir.with_name("_" + exp_dir.name)
    while target.exists():
        target = target.with_name("_" + target.name)
    exp_dir.rename(target)
33+
2834
def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60):
2935
"""Run exp_args.run() with a timeout and handle dependencies."""
3036
# episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout)
@@ -186,6 +192,6 @@ def hide_some_exp(base_dir, filter: callable, just_test):
186192
for exp in exp_list:
187193
if filter(exp):
188194
if not just_test:
189-
_move_old_exp(exp.exp_dir)
195+
move_old_exp(exp.exp_dir)
190196
filtered_out.append(exp)
191197
return filtered_out

src/agentlab/experiments/launch_exp.py

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import bgym
66
from browsergym.experiments.loop import ExpArgs, yield_all_exp_results
77

8-
from agentlab.experiments.exp_utils import run_exp
8+
from agentlab.experiments.exp_utils import run_exp, move_old_exp
99

1010

1111
def run_experiments(
@@ -70,15 +70,6 @@ def run_experiments(
7070
for exp_args in exp_args_list
7171
)
7272

73-
# dask will be deprecated, as there was issues. use ray instead
74-
# elif parallel_backend == "dask":
75-
# from agentlab.experiments.graph_execution_dask import (
76-
# execute_task_graph,
77-
# make_dask_client,
78-
# )
79-
80-
# with make_dask_client(n_worker=n_jobs):
81-
# execute_task_graph(exp_args_list)
8273
elif parallel_backend == "ray":
8374
from agentlab.experiments.graph_execution_ray import execute_task_graph, ray
8475

@@ -101,7 +92,7 @@ def run_experiments(
10192
logging.info("Experiment finished.")
10293

10394

104-
def find_incomplete(study_dir: str | Path, include_errors=True):
95+
def prepare_study_for_relaunch(study_dir: str | Path, include_errors=True):
10596
"""Find all incomplete experiments for relaunching.
10697
10798
Note: completed experiments are kept but are replaced by dummy exp_args
@@ -130,7 +121,7 @@ def find_incomplete(study_dir: str | Path, include_errors=True):
130121
)
131122

132123
exp_result_list = list(yield_all_exp_results(study_dir, progress_fn=None))
133-
exp_args_list = [_hide_completed(exp_result, include_errors) for exp_result in exp_result_list]
124+
exp_args_list = [prepare_exp_for_relaunch(exp_result, include_errors) for exp_result in exp_result_list]
134125
# sort according to exp_args.order
135126
exp_args_list.sort(key=lambda exp_args: exp_args.order if exp_args.order is not None else 0)
136127

@@ -158,11 +149,18 @@ def noop(*args, **kwargs):
158149
pass
159150

160151

161-
def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True):
162-
"""Hide completed experiments from the list.
163-
164-
This little hack, allows an elegant way to keep the task dependencies for e.g. webarena
165-
while skipping the tasks that are completed when relaunching.
152+
def prepare_exp_for_relaunch(exp_result: bgym.ExpResult, include_errors: bool = True):
153+
"""Prepare an experiment for relaunching.
154+
155+
Based on the status, determine if it needs to be relaunched.
156+
if relaunch:
157+
move old exp_dir to _{exp_dir}
158+
if bypass:
159+
keep the exp_args in the list for the task dependencies but make it a dummy that will just
160+
execute nothing.
161+
162+
This bypass hack allows an elegant way to keep the task dependencies for e.g. webarena while
163+
skipping the tasks that are completed when relaunching.
166164
167165
Args:
168166
exp_result: bgym.ExpResult
@@ -175,19 +173,23 @@ def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True):
175173
The ExpArgs object hidden if the experiment is completed.
176174
"""
177175

178-
hide = False
176+
bypass = False
179177
if exp_result.status == "done":
180-
hide = True
178+
bypass = True
181179
if exp_result.status == "error" and (not include_errors):
182-
hide = True
180+
bypass = True
183181

184182
exp_args = exp_result.exp_args
185-
exp_args.is_dummy = hide # just to keep track
183+
exp_args.is_dummy = bypass # just to keep track
186184
exp_args.status = exp_result.status
187-
if hide:
185+
if bypass:
188186
# make those function do nothing since they are finished.
189187
exp_args.run = noop
190188
exp_args.prepare = noop
189+
else:
190+
if exp_args.exp_dir is not None:
191+
move_old_exp(exp_args.exp_dir)
192+
exp_args.exp_dir = None
191193

192194
return exp_args
193195

src/agentlab/experiments/study.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from agentlab.analyze import inspect_results
1717
from agentlab.experiments import reproducibility_util as repro
1818
from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies
19-
from agentlab.experiments.launch_exp import find_incomplete, non_dummy_count, run_experiments
19+
from agentlab.experiments.launch_exp import prepare_study_for_relaunch, non_dummy_count, run_experiments
2020

2121
logger = logging.getLogger(__name__)
2222

@@ -239,7 +239,7 @@ def find_incomplete(self, include_errors=True):
239239
list[ExpArgs]: The list of all experiments with completed ones replaced by a
240240
dummy exp_args to keep the task dependencies.
241241
"""
242-
self.exp_args_list = find_incomplete(self.dir, include_errors=include_errors)
242+
self.exp_args_list = prepare_study_for_relaunch(self.dir, include_errors=include_errors)
243243
n_incomplete = non_dummy_count(self.exp_args_list)
244244
n_error = [
245245
getattr(exp_args, "status", "incomplete") == "error" for exp_args in self.exp_args_list
@@ -276,19 +276,53 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None):
276276
)
277277
self.reproducibility_info = info
278278

279+
def save(self, exp_root=RESULTS_DIR):
    """Save the study, then prepare each of its experiments.

    Args:
        exp_root: Root directory forwarded to the parent class's ``save``.
            Defaults to ``RESULTS_DIR``.
    """
    super().save(exp_root=exp_root)
    # `prepare` will save each exp_args in its own directory under the study dir.
    for experiment in self.exp_args_list:
        experiment.prepare(self.dir)
283+
279284
def run(
280285
self,
281286
n_jobs=1,
282287
parallel_backend="ray",
283288
strict_reproducibility=False,
284289
n_relaunch=3,
285290
relaunch_errors=True,
291+
exp_root=RESULTS_DIR,
286292
):
293+
"""Run the study.
294+
295+
Make sure the benchmarks are setup properly. See AgentLab's readme for more information.
296+
297+
Note: task hanging can be particularly annoying i.e playwright will loop indefinitely and
298+
nothing will happen. This will jam a worker and if no workers are available, the whole
299+
experiment will jam. We spent a lot of time debugging this, with some success but it still
300+
happens on some task. The ray backend will cancel the task after the specified timeout
301+
(defaults to 60s * max_step).
287302
303+
Args:
304+
n_jobs: int
305+
Number of parallel jobs.
306+
parallel_backend: str
307+
Parallel backend to use. Either "ray", "joblib", or "sequential". Note: joblib does
308+
not handle task dependencies. Also ray is the only one that can cancel tasks that
309+
are hanging.
310+
strict_reproducibility: bool
311+
If True, all modifications have to be committed before running the experiments.
312+
Also, if relaunching a study, it will not be possible if the code has changed.
313+
n_relaunch: int
314+
Number of times to relaunch the study if it has incomplete or errored experiments.
315+
(Visual)WebArena will have an instance reset before each evaluation.
316+
relaunch_errors: bool
317+
If False, relaunch only incomplete experiments and ignore errored ones.
318+
exp_root: Path
319+
The root directory where the study will be saved, defaults to AGENTLAB_EXP_ROOT env
320+
variable, which defaults to $HOME/agentlab_results.
321+
"""
288322
self.set_reproducibility_info(
289323
strict_reproducibility=strict_reproducibility, comment=self.comment
290324
)
291-
self.save()
325+
self.save(exp_root)
292326

293327
n_exp = len(self.exp_args_list)
294328
last_error_count = None
@@ -377,6 +411,10 @@ def override_max_steps(self, max_steps):
377411

378412
@staticmethod
379413
def load(dir: Path) -> "Study":
414+
# TODO it's probably better to have a more intelligent way to load the study
415+
# * we should pop exp_args_list before saving and load from the individual directories
416+
# * when reloading, we should update the directory to reflect the actual ones in case it was moved
417+
# * same applies with sequential studies, i.e. it should pop the studies before saving and load them from their individual directories
380418
dir = Path(dir)
381419
study_path = dir / "study.pkl.gz"
382420
if not study_path.exists() and dir.is_dir():
@@ -443,18 +481,28 @@ def name(self):
443481
return _make_study_name(agent_names, benchmark_names, self.suffix)
444482

445483
def find_incomplete(self, include_errors=True):
484+
n_incomplete, n_error = 0, 0
446485
for study in self.studies:
447-
study.find_incomplete(include_errors=include_errors)
486+
n_inc, n_err = study.find_incomplete(include_errors=include_errors)
487+
n_incomplete += n_inc
488+
n_error += n_err
489+
return n_incomplete, n_error
448490

449-
def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3):
491+
def save(self, exp_root=RESULTS_DIR):
    """Save every contained study, then save this object itself.

    The order of the three steps is deliberate and must be kept.

    Args:
        exp_root: Root directory in which this object's directory is created.
            Defaults to ``RESULTS_DIR``.
    """
    # Step 1: materialize our own directory so the individual studies have a
    # place to be stored.
    self.make_dir(exp_root)

    # Step 2: save each individual study inside that directory.
    for sub_study in self.studies:
        sub_study.save(exp_root=self.dir)

    # Step 3: save this object last, once the individual studies are
    # materialized, to ensure those objects have the proper study dir.
    super().save(exp_root=exp_root)
499+
500+
def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, exp_root=RESULTS_DIR):
450501

451502
# This sequence of making directories is important to make sure objects are materialized
452503
# properly before saving. Otherwise relaunch may not work properly.
453-
self.make_dir()
454-
for study in self.studies:
455-
study.make_dir(exp_root=self.dir)
456-
457-
self.save()
504+
505+
self.save(exp_root)
458506

459507
for study in self.studies:
460508
study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch)
@@ -484,7 +532,7 @@ def get_most_recent_study(
484532
Returns:
485533
Path: The most recent folder satisfying the conditions
486534
"""
487-
535+
root_dir = Path(root_dir)
488536
if root_dir is None:
489537
root_dir = RESULTS_DIR
490538

tests/experiments/test_launch_exp.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
from contextlib import contextmanager
12
import math
3+
import shutil
24
import tempfile
35
from pathlib import Path
46

@@ -8,21 +10,41 @@
810
from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5, AGENT_4o_MINI
911
from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
1012
from agentlab.analyze import inspect_results
11-
from agentlab.experiments.launch_exp import find_incomplete, run_experiments, non_dummy_count
13+
from agentlab.experiments.launch_exp import prepare_study_for_relaunch, run_experiments, non_dummy_count
1214
from agentlab.experiments.study import Study
1315
from agentlab.llm.chat_api import CheatMiniWoBLLMArgs
1416

1517

18+
19+
@contextmanager
def tmp_test_study():
    """Yield the path of a throwaway copy of the ``data/test_study`` fixture.

    The fixture directory is copied into a fresh temporary directory so a test
    can modify it without touching the original. The copy is deleted when the
    context exits, because it lives inside a ``tempfile.TemporaryDirectory``.

    Yields:
        Path: location of the copied ``test_study`` directory.
    """
    fixture_dir = Path(__file__).parent.parent / "data" / "test_study"

    with tempfile.TemporaryDirectory() as scratch_root:
        study_copy = Path(scratch_root) / "test_study"
        shutil.copytree(fixture_dir, study_copy)
        # No explicit cleanup is needed: leaving the `with` block removes the
        # temporary directory, whether or not the caller's body raised.
        yield study_copy
35+
36+
1637
def test_relaunch_study():
17-
study_dir = Path(__file__).parent.parent / "data" / "test_study"
18-
exp_args_list = find_incomplete(study_dir, include_errors=False)
38+
with tmp_test_study() as study_dir:
39+
exp_args_list = prepare_study_for_relaunch(study_dir, include_errors=False)
1940

20-
assert non_dummy_count(exp_args_list) == 1
21-
assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers"
41+
assert non_dummy_count(exp_args_list) == 1
42+
assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers"
2243

23-
exp_args_list = find_incomplete(study_dir, include_errors=True)
44+
with tmp_test_study() as study_dir:
45+
exp_args_list = prepare_study_for_relaunch(study_dir, include_errors=True)
2446

25-
assert non_dummy_count(exp_args_list) == 2
47+
assert non_dummy_count(exp_args_list) == 2
2648

2749

2850
def _test_launch_system(backend="ray", cause_timeout=False):
@@ -120,7 +142,8 @@ def test_4o_mini_on_miniwob_tiny_test():
120142

121143

122144
if __name__ == "__main__":
123-
test_timeout_ray()
145+
test_relaunch_study()
146+
# test_timeout_ray()
124147
# test_4o_mini_on_miniwob_tiny_test()
125148
# test_launch_system_ray()
126149
# test_launch_system_sequntial()

0 commit comments

Comments
 (0)