First test for saving the study differently; still need to make it work.

recursix · recursix · commit ce9dfafbae30 · 2024-12-04T13:25:45.000-05:00
diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 from time import sleep, time
 
-from browsergym.experiments.loop import ExpArgs, _move_old_exp, yield_all_exp_results
+from browsergym.experiments.loop import ExpArgs, yield_all_exp_results
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)  # Get logger based on module name
@@ -25,6 +25,12 @@
 RESULTS_DIR.mkdir(parents=True, exist_ok=True)
 
 
+def move_old_exp(exp_dir):
+    """Rename exp_dir to "_" + its name (same parent); no-op if it does not exist."""
+    exp_dir = Path(exp_dir)
+    if exp_dir.exists():
+        exp_dir.rename(exp_dir.with_name("_" + exp_dir.name))
+
 def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60):
     """Run exp_args.run() with a timeout and handle dependencies."""
     # episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout)
@@ -186,6 +192,6 @@ def hide_some_exp(base_dir, filter: callable, just_test):
     for exp in exp_list:
         if filter(exp):
             if not just_test:
-                _move_old_exp(exp.exp_dir)
+                move_old_exp(exp.exp_dir)
             filtered_out.append(exp)
     return filtered_out
diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py
@@ -5,7 +5,7 @@
 import bgym
 from browsergym.experiments.loop import ExpArgs, yield_all_exp_results
 
-from agentlab.experiments.exp_utils import run_exp
+from agentlab.experiments.exp_utils import run_exp, move_old_exp
 
 
 def run_experiments(
@@ -70,15 +70,6 @@ def run_experiments(
                 for exp_args in exp_args_list
             )
 
-        # dask will be deprecated, as there was issues. use ray instead
-        # elif parallel_backend == "dask":
-        #     from agentlab.experiments.graph_execution_dask import (
-        #         execute_task_graph,
-        #         make_dask_client,
-        #     )
-
-        #     with make_dask_client(n_worker=n_jobs):
-        #         execute_task_graph(exp_args_list)
         elif parallel_backend == "ray":
             from agentlab.experiments.graph_execution_ray import execute_task_graph, ray
 
@@ -101,7 +92,7 @@ def run_experiments(
         logging.info("Experiment finished.")
 
 
-def find_incomplete(study_dir: str | Path, include_errors=True):
+def prepare_study_for_relaunch(study_dir: str | Path, include_errors=True):
     """Find all incomplete experiments for relaunching.
 
     Note: completed experiments are kept but are replaced by dummy exp_args
@@ -130,7 +121,7 @@ def find_incomplete(study_dir: str | Path, include_errors=True):
         )
 
     exp_result_list = list(yield_all_exp_results(study_dir, progress_fn=None))
-    exp_args_list = [_hide_completed(exp_result, include_errors) for exp_result in exp_result_list]
+    exp_args_list = [prepare_exp_for_relaunch(exp_result, include_errors) for exp_result in exp_result_list]
     # sort according to exp_args.order
     exp_args_list.sort(key=lambda exp_args: exp_args.order if exp_args.order is not None else 0)
 
@@ -158,11 +149,18 @@ def noop(*args, **kwargs):
     pass
 
 
-def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True):
-    """Hide completed experiments from the list.
-
-    This little hack, allows an elegant way to keep the task dependencies for e.g. webarena
-    while skipping the tasks that are completed when relaunching.
+def prepare_exp_for_relaunch(exp_result: bgym.ExpResult, include_errors: bool = True):
+    """Prepare an experiment for relaunching.
+    
+    Based on the status, determine whether it needs to be relaunched.
+    if relaunch:
+        move the old exp_dir to _{exp_dir}
+    if bypass:
+        keep the exp_args in the list for the task dependencies but make it a dummy that will just
+        execute nothing.
+
+    This bypass hack allows an elegant way to keep the task dependencies for e.g. webarena while
+    skipping the tasks that are completed when relaunching.
 
     Args:
         exp_result: bgym.ExpResult
@@ -175,19 +173,23 @@ def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True):
             The ExpArgs object hidden if the experiment is completed.
     """
 
-    hide = False
+    bypass = False
     if exp_result.status == "done":
-        hide = True
+        bypass = True
     if exp_result.status == "error" and (not include_errors):
-        hide = True
+        bypass = True
 
     exp_args = exp_result.exp_args
-    exp_args.is_dummy = hide  # just to keep track
+    exp_args.is_dummy = bypass  # just to keep track
     exp_args.status = exp_result.status
-    if hide:
+    if bypass:
         # make those function do nothing since they are finished.
         exp_args.run = noop
         exp_args.prepare = noop
+    else:
+        if exp_args.exp_dir is not None:
+            move_old_exp(exp_args.exp_dir)
+            exp_args.exp_dir = None
 
     return exp_args
 
diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
@@ -16,7 +16,7 @@
 from agentlab.analyze import inspect_results
 from agentlab.experiments import reproducibility_util as repro
 from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies
-from agentlab.experiments.launch_exp import find_incomplete, non_dummy_count, run_experiments
+from agentlab.experiments.launch_exp import prepare_study_for_relaunch, non_dummy_count, run_experiments
 
 logger = logging.getLogger(__name__)
 
@@ -239,7 +239,7 @@ def find_incomplete(self, include_errors=True):
             list[ExpArgs]: The list of all experiments with completed ones replaced by a
                 dummy exp_args to keep the task dependencies.
         """
-        self.exp_args_list = find_incomplete(self.dir, include_errors=include_errors)
+        self.exp_args_list = prepare_study_for_relaunch(self.dir, include_errors=include_errors)
         n_incomplete = non_dummy_count(self.exp_args_list)
         n_error = [
             getattr(exp_args, "status", "incomplete") == "error" for exp_args in self.exp_args_list
@@ -276,19 +276,53 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None):
             )
         self.reproducibility_info = info
 
+    def save(self, exp_root=RESULTS_DIR):
+        super().save(exp_root=exp_root)
+        for exp_args in self.exp_args_list: 
+            exp_args.prepare(self.dir) # this will save the exp_args in their own directory
+
     def run(
         self,
         n_jobs=1,
         parallel_backend="ray",
         strict_reproducibility=False,
         n_relaunch=3,
         relaunch_errors=True,
+        exp_root=RESULTS_DIR,
     ):
+        """Run the study.
+        
+        Make sure the benchmarks are setup properly. See AgentLab's readme for more information.
+
+        Note: task hanging can be particularly annoying i.e playwright will loop indefinitely and
+        nothing will happen. This will jam a worker and if no workers are available, the whole
+        experiment will jam. We spent a lot of time debugging this, with some success but it still
+        happens on some task. The ray backend will cancel the task after the specified timeout
+        (defaults to 60s * max_step).
 
+        Args:
+            n_jobs: int
+                Number of parallel jobs.
+            parallel_backend: str
+                Parallel backend to use. Either "ray", "joblib", or "sequential". Note: joblib does
+                not handle task dependencies. Also ray is the only one that can cancel tasks that
+                are hanging.
+            strict_reproducibility: bool
+                If True, all modifications have to be committed before running the experiments.
+                Also, if relaunching a study, it will not be possible if the code has changed.
+            n_relaunch: int
+                Number of times to relaunch the study if it has incomplete or errored experiments.
+                (Visual)WebArena will have an instance reset before each evaluation.
+            relaunch_errors: bool
+                If False, relaunch only incomplete experiments and ignore errored ones.
+            exp_root: Path
+                The root directory where the study will be saved, defaults to AGENTLAB_EXP_ROOT env
+                variable, which defaults to $HOME/agentlab_results.
+        """
         self.set_reproducibility_info(
             strict_reproducibility=strict_reproducibility, comment=self.comment
         )
-        self.save()
+        self.save(exp_root)
 
         n_exp = len(self.exp_args_list)
         last_error_count = None
@@ -377,6 +411,10 @@ def override_max_steps(self, max_steps):
 
     @staticmethod
     def load(dir: Path) -> "Study":
+        # TODO it's probably better to have a more intelligent way to load the study
+        # * we should pop exp_args_list before saving and load from the individual directories
+        # * when reloading, we should update the directory to reflect the actual ones in case it was moved
+        # * same applies with sequential studies, i.e. it should pop the studies before saving
         dir = Path(dir)
         study_path = dir / "study.pkl.gz"
         if not study_path.exists() and dir.is_dir():
@@ -443,18 +481,28 @@ def name(self):
         return _make_study_name(agent_names, benchmark_names, self.suffix)
 
     def find_incomplete(self, include_errors=True):
+        n_incomplete, n_error = 0, 0
         for study in self.studies:
-            study.find_incomplete(include_errors=include_errors)
+            n_inc, n_err = study.find_incomplete(include_errors=include_errors)
+            n_incomplete += n_inc
+            n_error += n_err
+        return n_incomplete, n_error
 
-    def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3):
+    def save(self, exp_root=RESULTS_DIR):
+        # Materialize this study's directory first, to have a place to store the individual studies.
+        self.make_dir(exp_root)
+        for study in self.studies:
+            study.save(exp_root=self.dir)
+        # Save this study object only after the individual studies are materialized, to ensure
+        # these objects have the proper study dir.
+        super().save(exp_root=exp_root) 
+
+    def run(self, n_jobs=1, parallel_backend="ray", strict_reproducibility=False, n_relaunch=3, exp_root=RESULTS_DIR):
 
         # This sequence of of making directories is important to make sure objects are materialized
         # properly before saving. Otherwise relaunch may not work properly.
-        self.make_dir()
-        for study in self.studies:
-            study.make_dir(exp_root=self.dir)
-
-        self.save()
+        
+        self.save(exp_root)
 
         for study in self.studies:
             study.run(n_jobs, parallel_backend, strict_reproducibility, n_relaunch)
@@ -484,7 +532,7 @@ def get_most_recent_study(
     Returns:
         Path: The most recent folder satisfying the conditions
     """
-
+    root_dir = Path(root_dir) if root_dir is not None else None  # Path(None) raises TypeError
     if root_dir is None:
         root_dir = RESULTS_DIR
 
diff --git a/tests/experiments/test_launch_exp.py b/tests/experiments/test_launch_exp.py
@@ -1,4 +1,6 @@
+from contextlib import contextmanager
 import math
+import shutil
 import tempfile
 from pathlib import Path
 
@@ -8,21 +10,41 @@
 from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_3_5, AGENT_4o_MINI
 from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
 from agentlab.analyze import inspect_results
-from agentlab.experiments.launch_exp import find_incomplete, run_experiments, non_dummy_count
+from agentlab.experiments.launch_exp import prepare_study_for_relaunch, run_experiments, non_dummy_count
 from agentlab.experiments.study import Study
 from agentlab.llm.chat_api import CheatMiniWoBLLMArgs
 
 
+
+@contextmanager
+def tmp_test_study():
+    """Yield the path of a disposable copy of the tests/data/test_study directory.
+
+    The copy is made inside a TemporaryDirectory, which deletes it when the context
+    exits, so callers may modify the study on disk without touching the source data.
+    """
+    source_study_dir = Path(__file__).parent.parent / "data" / "test_study"
+
+    with tempfile.TemporaryDirectory() as tmp_root:
+        # Work on a copy placed under the temporary root.
+        study_copy = Path(tmp_root) / "test_study"
+        shutil.copytree(source_study_dir, study_copy)
+
+        # No try/finally needed: TemporaryDirectory cleans up even if the caller raises.
+        yield study_copy
+
+
 def test_relaunch_study():
-    study_dir = Path(__file__).parent.parent / "data" / "test_study"
-    exp_args_list = find_incomplete(study_dir, include_errors=False)
+    with tmp_test_study() as study_dir:
+        exp_args_list = prepare_study_for_relaunch(study_dir, include_errors=False)
 
-    assert non_dummy_count(exp_args_list) == 1
-    assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers"
+        assert non_dummy_count(exp_args_list) == 1
+        assert exp_args_list[0].env_args.task_name == "miniwob.ascending-numbers"
 
-    exp_args_list = find_incomplete(study_dir, include_errors=True)
+    with tmp_test_study() as study_dir:
+        exp_args_list = prepare_study_for_relaunch(study_dir, include_errors=True)
 
-    assert non_dummy_count(exp_args_list) == 2
+        assert non_dummy_count(exp_args_list) == 2
 
 
 def _test_launch_system(backend="ray", cause_timeout=False):
@@ -120,7 +142,8 @@ def test_4o_mini_on_miniwob_tiny_test():
 
 
 if __name__ == "__main__":
-    test_timeout_ray()
+    test_relaunch_study()
+    # test_timeout_ray()
     # test_4o_mini_on_miniwob_tiny_test()
     # test_launch_system_ray()
     # test_launch_system_sequntial()
diff --git a/tests/experiments/test_study.py b/tests/experiments/test_study.py