Clean pipeline (#117)

recursix · gasse · web-flow · commit 16e752629700 · 2024-11-13T11:59:34.000-05:00
* yet another way to kill timed-out jobs * Improve timeout handling in task polling logic * Add method to override max_steps in Study class * add support for tab visibility in observation flags and update related components * fix tests * black * Improve timeout handling in task polling logic * yet another way to kill timed-out jobs (#108) * Add method to override max_steps in Study class * add support for tab visibility in observation flags and update related components * fix tests * black * black * Fix sorting bug. Improve directory content retrieval with summary statistics * fix test * black * tmp * add error report, add cum cost to summary and ray backend by default * black * fix test (changing to joblib backend) * black --------- Co-authored-by: Maxime Gasse <maxime.gasse@gmail.com>
diff --git a/src/agentlab/agents/dynamic_prompting.py b/src/agentlab/agents/dynamic_prompting.py
@@ -10,12 +10,7 @@
 
 import bgym
 from browsergym.core.action.base import AbstractActionSet
-from browsergym.utils.obs import (
-    flatten_axtree_to_str,
-    flatten_dom_to_str,
-    overlay_som,
-    prune_html,
-)
+from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, overlay_som, prune_html
 
 from agentlab.llm.llm_utils import (
     BaseMessage,
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
@@ -142,6 +142,10 @@ def filter_agent_id(self, agent_id: list[tuple]):
     max-height: 400px;
     overflow-y: auto;
 }
+.error-report {
+    max-height: 700px;
+    overflow-y: auto;
+}
 .my-code-view {
     max-height: 300px;
     overflow-y: auto;
@@ -284,6 +288,8 @@ def run_gradio(results_dir: Path):
             with gr.Tab("Global Stats"):
                 global_stats = gr.DataFrame(max_height=500, show_label=False, interactive=False)
 
+            with gr.Tab("Error Report"):
+                error_report = gr.Markdown(elem_classes="error-report", show_copy_button=True)
         with gr.Row():
             episode_info = gr.Markdown(label="Episode Info", elem_classes="my-markdown")
             action_info = gr.Markdown(label="Action Info", elem_classes="my-markdown")
@@ -411,7 +417,7 @@ def run_gradio(results_dir: Path):
         exp_dir_choice.change(
             fn=new_exp_dir,
             inputs=exp_dir_choice,
-            outputs=[agent_table, agent_id, constants, variables, global_stats],
+            outputs=[agent_table, agent_id, constants, variables, global_stats, error_report],
         )
 
         agent_table.select(fn=on_select_agent, inputs=agent_table, outputs=[agent_id])
@@ -918,19 +924,25 @@ def get_agent_report(result_df: pd.DataFrame):
 
 
 def update_global_stats():
-    global info
     stats = inspect_results.global_report(info.result_df, reduce_fn=inspect_results.summarize_stats)
     stats.reset_index(inplace=True)
     return stats
 
 
+def update_error_report():
+    report_files = list(info.exp_list_dir.glob("error_report*.md"))
+    if len(report_files) == 0:
+        return "No error report found"
+    report_files = sorted(report_files, key=os.path.getctime, reverse=True)
+    return report_files[0].read_text()
+
+
 def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
 
     if exp_dir == select_dir_instructions:
         return None, None
 
     exp_dir = exp_dir.split(" - ")[0]
-    global info
 
     if len(exp_dir) == 0:
         info.exp_list_dir = None
@@ -951,7 +963,14 @@ def new_exp_dir(exp_dir, progress=gr.Progress(), just_refresh=False):
     agent_id = info.get_agent_id(agent_report.iloc[0])
 
     constants, variables = format_constant_and_variables()
-    return agent_report, agent_id, constants, variables, update_global_stats()
+    return (
+        agent_report,
+        agent_id,
+        constants,
+        variables,
+        update_global_stats(),
+        update_error_report(),
+    )
 
 
 def new_agent_id(agent_id: list[tuple]):
diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py
@@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False):
             n_err=err.sum(skipna=True),
         )
         if "stats.cum_cost" in sub_df:
-            record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),)
+            record["cum_cost"] = sub_df["stats.cum_cost"].sum(skipna=True).round(4)
 
     return pd.Series(record)
 
diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py
@@ -40,9 +40,9 @@ def run_experiments(
     study_dir = Path(study_dir)
     study_dir.mkdir(parents=True, exist_ok=True)
 
-    if n_jobs == 1 and parallel_backend != "sequential":
-        logging.warning("Only 1 job, switching to sequential backend.")
-        parallel_backend = "sequential"
+    # if n_jobs == 1 and parallel_backend != "sequential":
+    #     logging.warning("Only 1 job, switching to sequential backend.")
+    #     parallel_backend = "sequential"
 
     logging.info(f"Saving experiments to {study_dir}")
     for exp_args in exp_args_list:
diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
@@ -123,7 +123,7 @@ def set_reproducibility_info(self, strict_reproducibility=False, comment=None):
     def run(
         self,
         n_jobs=1,
-        parallel_backend="joblib",
+        parallel_backend="ray",
         strict_reproducibility=False,
         n_relaunch=3,
         relaunch_errors=True,
diff --git a/tests/agents/test_agent.py b/tests/agents/test_agent.py
@@ -25,7 +25,9 @@ def test_generic_agent():
 
     with tempfile.TemporaryDirectory() as tmp_dir:
 
-        launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test")
+        launch_exp.run_experiments(
+            1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib"
+        )
 
         result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None)
 
@@ -144,9 +146,12 @@ def test_generic_agent_parse_retry():
     )
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test")
+        # TODO: why don't these tests work with the ray backend?
+        launch_exp.run_experiments(
+            1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib"
+        )
         result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None)
-
+        print(result_record)
         target = {
             "stats.cum_n_retry": 2,
             "stats.cum_busted_retry": 0,
@@ -169,7 +174,9 @@ def test_bust_parse_retry():
     )
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test")
+        launch_exp.run_experiments(
+            1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib"
+        )
         result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None)
 
         target = {
@@ -195,7 +202,9 @@ def test_llm_error_success():
     )
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test")
+        launch_exp.run_experiments(
+            1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib"
+        )
         result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None)
 
         target = {
@@ -220,7 +229,9 @@ def test_llm_error_no_success():
     )
 
     with tempfile.TemporaryDirectory() as tmp_dir:
-        launch_exp.run_experiments(1, [exp_args], Path(tmp_dir) / "generic_agent_test")
+        launch_exp.run_experiments(
+            1, [exp_args], Path(tmp_dir) / "generic_agent_test", parallel_backend="joblib"
+        )
         result_record = inspect_results.load_result_df(tmp_dir, progress_fn=None)
 
         target = {
@@ -236,4 +247,4 @@ def test_llm_error_no_success():
 
 if __name__ == "__main__":
     # test_generic_agent()
-    test_llm_error_success()
+    test_generic_agent_parse_retry()

Original file line number	Diff line number	Diff line change
`@@ -297,7 +297,7 @@ def summarize(sub_df, use_bootstrap=False):`
`297`	`297`	`n_err=err.sum(skipna=True),`
`298`	`298`	`)`
`299`	`299`	`if "stats.cum_cost" in sub_df:`
`300`		`- record["cum_cost"] = (sub_df["stats.cum_cost"].sum(skipna=True).round(4),)`
	`300`	`+ record["cum_cost"] = sub_df["stats.cum_cost"].sum(skipna=True).round(4)`
`301`	`301`
`302`	`302`	`return pd.Series(record)`
`303`	`303`