Reproducibility again (#49)

recursix · TLSDC · web-flow · commit adfcb8c25c46 · 2024-10-07T19:32:14.000+02:00
* core functions

* switch to dask

* removing joblib dependency and adding dask

* fixing imports

* handles multiple backends

* ensure asyncio loop creation

* more tests

* setting dashboard address to None

* minor

* Finally found a way to make it work

* initial reproducibility files

* Seems to be superfluous

* adding a reproducibility journal

* minor update

* more robust

* adding reproducibility tools

* fix white listing

* minor

* minor

* minor

* minor

* minor fix

* more tests

* more results yay

* disabling this test

* update

* update

* black

* maybe fixing github workflow ?

* make get_git_username great again

* trigger change

* new browsergym

* GPT-4o result (and new comment column)

* Seems like there was a change to 4o flags, trying these

* minor comment

* better xray

* minor fix

* adding a comment field

* new agent

* another test with GPT-4o

* adding llama3 from openrouter

* fix naming

* unused import

* new summary tools and remove "_args" from columns in results

* add Llama

* initial code for reproducibility agent

* adjust inspect results

* infer from benchmark

* fix reproducibility agent

* prevent repro_dir from being an index variable

* updating repro agent stats

* Reproducibility agent

* instructions to setup workarena

* fixing tests

* handles a few edge cases better

* default progress function to None

* minor formatting

* minor

* initial commit

* refactoring with Study class

* refactor to adapt for study class

* minor

* fix pricy test

* fixing tests

* tmp

* print report

* minor fix

* refine little details about reproducibility

* minor

* no need for set_temp anymore

* sanity check before running main

* minor update

* minor

* new results with 4o on workarena.l1

* sharing is caring

* add llama to main.py

* new journal entry

* format

---------

Co-authored-by: Thibault Le Sellier de Chezelles <thibault.de.chezelles@gmail.com>
diff --git a/main.py b/main.py
@@ -2,22 +2,27 @@
 Note: This script is a convenience script to launch experiments instead of using
 the command line.
 
-Don't push your changes to this file to git unless you are making structural changes.
+Copy this script and modify at will, but don't push your changes to the
+repository.
 """
 
 import logging
 
-from agentlab.agents.generic_agent import AGENT_CUSTOM, RANDOM_SEARCH_AGENT, AGENT_4o, AGENT_4o_MINI
+from agentlab.agents.generic_agent import (
+    RANDOM_SEARCH_AGENT,
+    AGENT_4o,
+    AGENT_4o_MINI,
+    AGENT_LLAMA3_70B,
+    AGENT_LLAMA31_70B,
+)
 from agentlab.analyze.inspect_results import get_most_recent_folder
 from agentlab.experiments import study_generators
-from agentlab.experiments.exp_utils import RESULTS_DIR
 
 logging.getLogger().setLevel(logging.INFO)
 
 # choose your agent or provide a new agent
 agent_args = [AGENT_4o_MINI]
-# agent = AGENT_4o
-
+# agent_args = [AGENT_4o]
 
 ## select the benchmark to run on
 benchmark = "miniwob_tiny_test"
@@ -27,34 +32,34 @@
 # benchmark = "workarena.l3"
 # benchmark = "webarena"
 
+# Set reproducibility_mode = True for reproducibility
+# this will "ask" agents to be deterministic. Also, it will prevent you from launching if you have
+# local changes. For your custom agents you need to implement set_reproducibility_mode
+reproducibility_mode = False
 
-## select the kind of experiment (study)
-## Or define new studies, you only have to return list of ExpArgs to run and a name for the study
-
-
-## alternatively, relaunch an existing study
-# study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
-# exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
-
+# Set relaunch = True to relaunch an existing study, this will continue incomplete
+# experiments and relaunch errored experiments
 relaunch = False
 
 ## Number of parallel jobs
-n_jobs = 1  # Make sure to use 1 job when debugging in VSCode
+n_jobs = 4  # Make sure to use 1 job when debugging in VSCode
 # n_jobs = -1  # to use all available cores
 
-# run the experiments
-if __name__ == "__main__":
+
+if __name__ == "__main__":  # necessary for dask backend
+
+    if reproducibility_mode:
+        [a.set_reproducibility_mode() for a in agent_args]
 
     if relaunch:
         #  relaunch an existing study
-        study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
+        study_dir = get_most_recent_folder()
         study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
 
     else:
         study = study_generators.run_agents_on_benchmark(agent_args, benchmark)
 
-    study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=False)
+    study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)
 
-    # Uncomment the following line if you think your study represent a
-    # reproducible result. You can run in relaunch mode to avoid re-running the experiments.
-    # study.append_to_journal(strict_reproducibility=True)
+    if reproducibility_mode:
+        study.append_to_journal(strict_reproducibility=True)
diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv
@@ -6,3 +6,6 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0
 recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
 recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
 recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
+recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140,  M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
+recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
+recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
diff --git a/src/agentlab/agents/agent_args.py b/src/agentlab/agents/agent_args.py
@@ -6,6 +6,11 @@ class AgentArgs(AbstractAgentArgs):
     def set_benchmark(self, benchmark: str, demo_mode: bool):
         """Optional method to set benchmark specific flags.
 
+        This allows the agent to have minor adjustments based on the benchmark.
+        E.g. using a benchmark specific action space. Or letting the agent see
+        HTML on MiniWoB since AXTree is not enough. Users should avoid making
+        extensive benchmark specific prompt engineering.
+
         Args:
             benchmark: str
                 Name of the benchmark.
@@ -14,3 +19,15 @@ def set_benchmark(self, benchmark: str, demo_mode: bool):
                 the demo_mode flag in the browsergym action space.
         """
         pass
+
+    def set_reproducibility_mode(self):
+        """Optional method to set the agent in a reproducibility mode.
+
+        This should adjust the agent configuration to make it as deterministic
+        as possible e.g. setting the temperature of the model to 0.
+
+        This is only called when reproducibility is requested.
+        """
+        raise NotImplementedError(
+            f"set_reproducibility_mode is not implemented for agent_args {self.__class__.__name__}"
+        )
diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py
@@ -2,6 +2,7 @@
     AGENT_3_5,
     AGENT_8B,
     AGENT_LLAMA3_70B,
+    AGENT_LLAMA31_70B,
     AGENT_CUSTOM,
     RANDOM_SEARCH_AGENT,
     AGENT_4o,
@@ -15,6 +16,7 @@
     "AGENT_4o_MINI",
     "AGENT_4o_VISION",
     "AGENT_LLAMA3_70B",
+    "AGENT_LLAMA31_70B",
     "AGENT_8B",
     "RANDOM_SEARCH_AGENT",
     "AGENT_CUSTOM",
diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py
@@ -33,6 +33,9 @@ def set_benchmark(self, benchmark, demo_mode):
         if demo_mode:
             self.flags.action.demo_mode = "all_blue"
 
+    def set_reproducibility_mode(self):
+        self.chat_model_args.temperature = 0
+
     def prepare(self):
         return self.chat_model_args.prepare_server()
 
diff --git a/src/agentlab/agents/most_basic_agent/most_basic_agent.py b/src/agentlab/agents/most_basic_agent/most_basic_agent.py
@@ -1,44 +1,44 @@
 import logging
-import os
-import re
 from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING, Any
 
-from browsergym.core.action.highlevel import HighLevelActionSet
-from browsergym.experiments.agent import Agent, AgentInfo
-from browsergym.experiments.loop import AbstractAgentArgs, EnvArgs, ExpArgs
+import bgym
 
 from agentlab.llm.chat_api import make_system_message, make_user_message
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry
 from agentlab.llm.tracking import cost_tracker_decorator
+from agentlab.agents.agent_args import AgentArgs
 
 if TYPE_CHECKING:
     from agentlab.llm.chat_api import BaseModelArgs
 
 
 @dataclass
-class MostBasicAgentArgs(AbstractAgentArgs):
+class MostBasicAgentArgs(AgentArgs):
     agent_name: str = "BasicAgent"
     temperature: float = 0.1
     use_chain_of_thought: bool = False
     chat_model_args: "BaseModelArgs" = None
 
-    def make_agent(self) -> Agent:
+    def make_agent(self) -> bgym.Agent:
         return MostBasicAgent(
             temperature=self.temperature,
             use_chain_of_thought=self.use_chain_of_thought,
             chat_model_args=self.chat_model_args,
         )
 
+    def set_reproducibility_mode(self):
+        self.temperature = 0
+
     def prepare(self):
         return self.chat_model_args.prepare_server()
 
     def close(self):
         return self.chat_model_args.close_server()
 
 
-class MostBasicAgent(Agent):
+class MostBasicAgent(bgym.Agent):
     def __init__(
         self, temperature: float, use_chain_of_thought: bool, chat_model_args: "BaseModelArgs"
     ):
@@ -47,7 +47,7 @@ def __init__(
         self.chat = chat_model_args.make_model()
         self.chat_model_args = chat_model_args
 
-        self.action_set = HighLevelActionSet(["bid"], multiaction=False)
+        self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
 
     @cost_tracker_decorator
     def get_action(self, obs: Any) -> tuple[str, dict]:
@@ -104,7 +104,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
 
         return (
             action,
-            AgentInfo(
+            bgym.AgentInfo(
                 think=thought,
                 chat_messages=messages,
                 # put any stats that you care about as long as it is a number or a dict of numbers
@@ -115,17 +115,19 @@ def parser(response: str) -> tuple[dict, bool, str]:
         )
 
 
-env_args = EnvArgs(
+# example for a single task
+env_args = bgym.EnvArgs(
     task_name="miniwob.click-button",
     task_seed=0,
     max_steps=10,
     headless=True,
 )
 
-chat_model_args = CHAT_MODEL_ARGS_DICT["azure/gpt-35-turbo/gpt-35-turbo"]
+chat_model_args = CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"]
 
+# example for 2 experiments testing chain of thoughts on a miniwob task
 exp_args = [
-    ExpArgs(
+    bgym.ExpArgs(
         agent_args=MostBasicAgentArgs(
             temperature=0.1,
             use_chain_of_thought=True,
@@ -134,7 +136,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
         env_args=env_args,
         logging_level=logging.INFO,
     ),
-    ExpArgs(
+    bgym.ExpArgs(
         agent_args=MostBasicAgentArgs(
             temperature=0.1,
             use_chain_of_thought=False,
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
@@ -481,7 +481,7 @@ def run_gradio(results_dir: Path):
         tabs.select(tab_select)
 
     demo.queue()
-    demo.launch(server_port=7899)
+    demo.launch(server_port=7899, share=True)
 
 
 def tab_select(evt: gr.SelectData):
diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py
@@ -167,7 +167,7 @@ def get_reproducibility_info(
     changes_white_list=(  # Files that are often modified during experiments but do not affect reproducibility
         "*/reproducibility_script.py",
         "*reproducibility_journal.csv",
-        "*/launch_command.py",
+        "*main.py",
     ),
     ignore_changes=False,
 ):
@@ -347,63 +347,6 @@ def _verify_report(report_df: pd.DataFrame, agent_names=list[str], strict_reprod
             )
     return report_df
 
-    # def add_reward(info, study_dir, ignore_incomplete=False):
-    #     """Add the average reward and standard error to the info dict.
-
-    #     Verifies that all tasks are completed and that there are no errors.
-    #     """
-    #     result_df = inspect_results.load_result_df(study_dir)
-    #     report = inspect_results.summarize_study(result_df)
-
-    #     if len(report) > 1:
-    #         raise ValueError("Multi agent not implemented yet")
-
-    #     if isinstance(info["agent_names"], (list, tuple)):
-    #         if len(info["agent_names"]) > 1:
-    #             raise ValueError("Multi agent not implemented yet")
-
-    #     idx = report.index[0]
-    #     n_err = report.loc[idx, "n_err"].item()
-    #     n_completed, n_total = report.loc[idx, "n_completed"].split("/")
-    #     if n_err > 0 and not ignore_incomplete:
-    #         raise ValueError(
-    #             f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
-    #         )
-    #     if n_completed != n_total and not ignore_incomplete:
-    #         raise ValueError(
-    #             f"Experiment has {n_completed} completed tasks out of {n_total}. "
-    #             f"Please rerun the study and make sure all tasks are completed."
-    #         )
-
-    #     for key in ("avg_reward", "std_err", "n_err", "n_completed"):
-    #         value = report.loc[idx, key]
-    #         if hasattr(value, "item"):
-    #             value = value.item()
-    #         info[key] = value
-
-    if isinstance(info["agent_name"], (list, tuple)):
-        if len(info["agent_name"]) > 1:
-            raise ValueError("Multi agent not implemented yet")
-
-    idx = report.index[0]
-    n_err = report.loc[idx, "n_err"].item()
-    n_completed, n_total = report.loc[idx, "n_completed"].split("/")
-    if n_err > 0 and not ignore_incomplete:
-        raise ValueError(
-            f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
-        )
-    if n_completed != n_total and not ignore_incomplete:
-        raise ValueError(
-            f"Experiment has {n_completed} completed tasks out of {n_total}. "
-            f"Please rerun the study and make sure all tasks are completed."
-        )
-
-    for key in ("avg_reward", "std_err", "n_err", "n_completed"):
-        value = report.loc[idx, key]
-        if hasattr(value, "item"):
-            value = value.item()
-        info[key] = value
-
 
 def _get_csv_headers(file_path: str) -> list[str]:
     with open(file_path, "r", newline="") as file:
@@ -464,10 +407,3 @@ def append_to_journal(
         writer = csv.writer(file)
         for row in rows:
             writer.writerow(row)
-
-
-def set_temp(agent_args: GenericAgentArgs, temperature=0):
-    """Set temperature to 0. Assumes a GenericAgent structure."""
-    agent_args = deepcopy(agent_args)
-    agent_args.chat_model_args.temperature = temperature
-    return agent_args
diff --git a/src/agentlab/experiments/study_generators.py b/src/agentlab/experiments/study_generators.py
@@ -65,6 +65,9 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False)
         self.write_reproducibility_info(strict_reproducibility=strict_reproducibility)
 
         run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend)
+        report_df = self.get_report(ignore_cache=True)
+        logging.info(f"Study {self.name} finished.")
+        logging.info("\n" + str(report_df))
 
     def append_to_journal(self, strict_reproducibility=True):
         """Append the study to the journal.
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
@@ -1,5 +1,3 @@
-import os
-
 from agentlab.llm.chat_api import (
     AzureModelArgs,
     OpenAIModelArgs,
diff --git a/tests/experiments/test_reproducibility_util.py b/tests/experiments/test_reproducibility_util.py
@@ -8,11 +8,6 @@
 import json
 
 
-def test_set_temp():
-    agent_args = reproducibility_util.set_temp(AGENT_4o_MINI)
-    assert agent_args.chat_model_args.temperature == 0
-
-
 @pytest.mark.parametrize(
     "benchmark_name",
     ["miniwob", "workarena.l1", "webarena", "visualwebarena"],
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+import subprocess
+import pytest
+
+
+@pytest.mark.pricy
+def test_main_script_execution():
+    # this should trigger agent_4o_mini on miniwob_tiny_test unless this was
+    # reconfigured differently.
+    script_path = Path(__file__).parent.parent / "main.py"
+
+    # just make sure it's in the right state
+    main = __import__(script_path.stem)
+    assert main.benchmark == "miniwob_tiny_test"
+    assert main.reproducibility_mode == False
+    assert main.relaunch == False
+    assert main.n_jobs <= 10
+
+    result = subprocess.run(["python", script_path], capture_output=True, text=True, timeout=5 * 60)
+    assert result.returncode == 0
+
+
+if __name__ == "__main__":
+    test_main_script_execution()

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-import os`
`2`		`-`
`3`	`1`	`from agentlab.llm.chat_api import (`
`4`	`2`	`AzureModelArgs,`
`5`	`3`	`OpenAIModelArgs,`