|
16 | 16 | from agentlab.analyze import inspect_results |
17 | 17 | from agentlab.experiments import reproducibility_util as repro |
18 | 18 | from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies |
19 | | -from agentlab.experiments.launch_exp import find_incomplete, non_dummy_count, run_experiments |
| 19 | +from agentlab.experiments.launch_exp import prepare_study_for_relaunch, non_dummy_count, run_experiments |
20 | 20 |
|
21 | 21 | logger = logging.getLogger(__name__) |
22 | 22 |
|
@@ -239,7 +239,7 @@ def find_incomplete(self, include_errors=True): |
239 | 239 | list[ExpArgs]: The list of all experiments with completed ones replaced by a |
240 | 240 | dummy exp_args to keep the task dependencies. |
241 | 241 | """ |
242 | | - self.exp_args_list = find_incomplete(self.dir, include_errors=include_errors) |
| 242 | + self.exp_args_list = prepare_study_for_relaunch(self.dir, include_errors=include_errors) |
243 | 243 | n_incomplete = non_dummy_count(self.exp_args_list) |
244 | 244 | n_error = [ |
245 | 245 | getattr(exp_args, "status", "incomplete") == "error" for exp_args in self.exp_args_list |
@@ -276,19 +276,53 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None): |
276 | 276 | ) |
277 | 277 | self.reproducibility_info = info |
278 | 278 |
|
| 279 | + def save(self, exp_root=RESULTS_DIR): |
| 280 | + super().save(exp_root=exp_root) |
| 281 | + for exp_args in self.exp_args_list: |
| 282 | + exp_args.prepare(self.dir) # this will save the exp_args in their own directory |
| 283 | + |
279 | 284 | def run( |
280 | 285 | self, |
281 | 286 | n_jobs=1, |
282 | 287 | parallel_backend="ray", |
283 | 288 | strict_reproducibility=False, |
284 | 289 | n_relaunch=3, |
285 | 290 | relaunch_errors=True, |
| 291 | + exp_root=RESULTS_DIR, |
286 | 292 | ): |
| 293 | + """Run the study. |
| 294 | + |
| 295 | + Make sure the benchmarks are set up properly. See AgentLab's readme for more information. |
| 296 | +
|
| 297 | + Note: task hanging can be particularly annoying, i.e., playwright will loop indefinitely and |
| 298 | + nothing will happen. This will jam a worker and if no workers are available, the whole |
| 299 | + experiment will jam. We spent a lot of time debugging this, with some success but it still |
| 300 | + happens on some tasks. The ray backend will cancel the task after the specified timeout |
| 301 | + (defaults to 60s * max_step). |
287 | 302 |
|
| 303 | + Args: |
| 304 | + n_jobs: int |
| 305 | + Number of parallel jobs. |
| 306 | + parallel_backend: str |
| 307 | + Parallel backend to use. Either "ray", "joblib", or "sequential". Note: joblib does |
| 308 | + not handle task dependencies. Also ray is the only one that can cancel tasks that |
| 309 | + are hanging. |
| 310 | + strict_reproducibility: bool |
| 311 | + If True, all modifications have to be committed before running the experiments. |
| 312 | + Also, if relaunching a study, it will not be possible if the code has changed. |
| 313 | + n_relaunch: int |
| 314 | + Number of times to relaunch the study if it has incomplete or errored experiments. |
| 315 | + (Visual)WebArena will have an instance reset before each evaluation. |
| 316 | + relaunch_errors: bool |
| 317 | + If False, relaunch only incomplete experiments and ignore errored ones. |
| 318 | + exp_root: Path |
| 319 | + The root directory where the study will be saved, defaults to AGENTLAB_EXP_ROOT env |
| 320 | + variable, which defaults to $HOME/agentlab_results. |
| 321 | + """ |
288 | 322 | self.set_reproducibility_info( |
289 | 323 | strict_reproducibility=strict_reproducibility, comment=self.comment |
290 | 324 | ) |
291 | | - self.save() |
| 325 | + self.save(exp_root) |
292 | 326 |
|
293 | 327 | n_exp = len(self.exp_args_list) |
294 | 328 | last_error_count = None |
@@ -377,6 +411,10 @@ def override_max_steps(self, max_steps): |
377 | 411 |
|
378 | 412 | @staticmethod |
379 | 413 | def load(dir: Path) -> "Study": |
| 414 | + # TODO it's probably better to have a more intelligent way to load the study |
| 415 | + # * we should pop exp_args_list before saving and load from the individual directories |
| 416 | + # * when reloading, we should update the directory to reflect the actual ones in case it was moved |
| 417 | + # * same applies with sequential studies, i.e. it should pop the studies before saving and |
380 | 418 | dir = Path(dir) |
381 | 419 | study_path = dir / "study.pkl.gz" |
382 | 420 | if not study_path.exists() and dir.is_dir(): |
@@ -443,18 +481,28 @@ def name(self): |
443 | 481 | return _make_study_name(agent_names, benchmark_names, self.suffix) |
444 | 482 |
|
445 | 483 | def find_incomplete(self, include_errors=True): |
| 484 | + n_incomplete, n_error = 0, 0 |
446 | 485 | for study in self.studies: |
447 | | - study.find_incomplete(include_errors=include_errors) |
| 486 | + n_inc, n_err = study.find_incomplete(include_errors=include_errors) |
| 487 | + n_incomplete += n_inc |
| 488 | + n_error += n_err |
| 489 | + return n_incomplete, n_error |
448 | 490 |
|
449 | | - def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3): |
| 491 | + def save(self, exp_root=RESULTS_DIR): |
| 492 | + # materialize the directory to have a place to store the individual studies |
| 493 | + self.make_dir(exp_root) |
| 494 | + for study in self.studies: |
| 495 | + study.save(exp_root=self.dir) |
| 496 | + # save the study object after the individual studies are materialized, to ensure these objects |
| 497 | + # have the proper study dir |
| 498 | + super().save(exp_root=exp_root) |
| 499 | + |
| 500 | + def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, exp_root=RESULTS_DIR): |
450 | 501 |
|
451 | 502 | # This sequence of making directories is important to make sure objects are materialized |
452 | 503 | # properly before saving. Otherwise relaunch may not work properly. |
453 | | - self.make_dir() |
454 | | - for study in self.studies: |
455 | | - study.make_dir(exp_root=self.dir) |
456 | | - |
457 | | - self.save() |
| 504 | + |
| 505 | + self.save(exp_root) |
458 | 506 |
|
459 | 507 | for study in self.studies: |
460 | 508 | study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch) |
@@ -484,7 +532,7 @@ def get_most_recent_study( |
484 | 532 | Returns: |
485 | 533 | Path: The most recent folder satisfying the conditions |
486 | 534 | """ |
487 | | - |
| 535 | + root_dir = Path(root_dir) |
488 | 536 | if root_dir is None: |
489 | 537 | root_dir = RESULTS_DIR |
490 | 538 |
|
|
0 commit comments