Skip to content

Commit adfcb8c

Browse files
recursixTLSDC
andauthored
Reproducibility again (#49)
* core functions * switch to dask * removing joblib dependency and adding dask * fixing imports * handles multiple backends * ensure asyncio loop creation * more tests * setting dashboard address to None * minor * Finally found a way to make it work * initial reproducibility files * Seems to be superflus * adding a reproducibility journal * minor update * more robust * adding reproducibility tools * fix white listing * minor * minor * minor * minor * minor fix * more tests * more results yay * disabling this test * update * update * black * maybe fixing github workflow ? * make get_git_username great again * trigger change * new browsergym * GPT-4o result (and new comment column) * Seems like there was a change to 4o flags, trying these * minor comment * better xray * minor fix * addming a comment field * new agent * another test with GPT-4o * adding llama3 from openrouter * fix naming * unused import * new summary tools and remove "_args" from columns in results * add Llama * initial code for reproducibility agent * adjust inspect results * infer from benchmark * fix reproducibility agent * prevent the repro_dir to be an index variable * updating repro agent stats * Reproducibility agent * instructions to setup workarena * fixing tests * handles better a few edge cases * default progress function to None * minor formatting * minor * initial commit * refactoring with Study class * refactor to adapt for study class * minor * fix pricy test * fixing tests * tmp * print report * minor fix * refine little details about reproducibility * minor * no need for set_temp anymore * sanity check before running main * minor update * minor * new results with 4o on workarena.l1 * sharing is caring * add llama to main.py * new hournal entry * format --------- Co-authored-by: Thibault Le Sellier de Chezelles <[email protected]>
1 parent f30064a commit adfcb8c

File tree

12 files changed

+96
-108
lines changed

12 files changed

+96
-108
lines changed

main.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,27 @@
22
Note: This script is a convenience script to launch experiments instead of using
33
the command line.
44
5-
Don't push your changes to this file to git unless you are making structural changes.
5+
Copy this script and modify at will, but don't push your changes to the
6+
repository.
67
"""
78

89
import logging
910

10-
from agentlab.agents.generic_agent import AGENT_CUSTOM, RANDOM_SEARCH_AGENT, AGENT_4o, AGENT_4o_MINI
11+
from agentlab.agents.generic_agent import (
12+
RANDOM_SEARCH_AGENT,
13+
AGENT_4o,
14+
AGENT_4o_MINI,
15+
AGENT_LLAMA3_70B,
16+
AGENT_LLAMA31_70B,
17+
)
1118
from agentlab.analyze.inspect_results import get_most_recent_folder
1219
from agentlab.experiments import study_generators
13-
from agentlab.experiments.exp_utils import RESULTS_DIR
1420

1521
logging.getLogger().setLevel(logging.INFO)
1622

1723
# choose your agent or provide a new agent
1824
agent_args = [AGENT_4o_MINI]
19-
# agent = AGENT_4o
20-
25+
# agent_args = [AGENT_4o]
2126

2227
## select the benchmark to run on
2328
benchmark = "miniwob_tiny_test"
@@ -27,34 +32,34 @@
2732
# benchmark = "workarena.l3"
2833
# benchmark = "webarena"
2934

35+
# Set reproducibility_mode = True to run the study in reproducibility mode.
36+
# This will "ask" agents to be deterministic. It will also prevent you from launching if you have
37+
# local changes. For your custom agents, you need to implement set_reproducibility_mode.
38+
reproducibility_mode = False
3039

31-
## select the kind of experiment (study)
32-
## Or define new studies, you only have to return list of ExpArgs to run and a name for the study
33-
34-
35-
## alternatively, relaunch an existing study
36-
# study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
37-
# exp_args_list, study_dir = relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
38-
40+
# Set relaunch = True to relaunch an existing study. This will continue incomplete
41+
# experiments and relaunch errored experiments.
3942
relaunch = False
4043

4144
## Number of parallel jobs
42-
n_jobs = 1 # Make sure to use 1 job when debugging in VSCode
45+
n_jobs = 4 # Make sure to use 1 job when debugging in VSCode
4346
# n_jobs = -1 # to use all available cores
4447

45-
# run the experiments
46-
if __name__ == "__main__":
48+
49+
if __name__ == "__main__": # necessary for dask backend
50+
51+
if reproducibility_mode:
52+
[a.set_reproducibility_mode() for a in agent_args]
4753

4854
if relaunch:
4955
# relaunch an existing study
50-
study_dir = get_most_recent_folder(RESULTS_DIR, contains=None)
56+
study_dir = get_most_recent_folder()
5157
study = study_generators.make_relaunch_study(study_dir, relaunch_mode="incomplete_or_error")
5258

5359
else:
5460
study = study_generators.run_agents_on_benchmark(agent_args, benchmark)
5561

56-
study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=False)
62+
study.run(n_jobs=n_jobs, parallel_backend="joblib", strict_reproducibility=reproducibility_mode)
5763

58-
# Uncomment the following line if you think your study represents a
59-
# reproducible result. You can run in relaunch mode to avoid re-running the experiments.
60-
# study.append_to_journal(strict_reproducibility=True)
64+
if reproducibility_mode:
65+
study.append_to_journal(strict_reproducibility=True)

reproducibility_journal.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,6 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-09-20_07-16-21,0
66
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-20_22-09-43,0.656,0.019,0,625/625,,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,f6216486d5faac2c8b3fb0a63e114e5a4bafde47,,0.6.4,8cef8fe34940ff490d0cc06b0c8f100180d09d43,
77
recursix,GenericAgent-gpt-4o-2024-05-13,miniwob,0.6.3,2024-09-21_12-04-39,0.656,0.019,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe561b93c5f053e9f9625358862f542523b5e14a,,0.7.0,ed6d6992ef64bfb91aca7002d33cb6ed5ec031ef,
88
recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0.539,0.02,0,625/625,None,Darwin (Darwin Kernel Version 23.6.0: Mon Jul 29 21:13:00 PDT 2024; root:xnu-10063.141.2~1/RELEASE_X86_64),3.12.2,1.39.0,0.2.1,fe27819a99b163fd9240ba3e144e010413bff24d,,0.7.1,b0ad675572e01cac0d7255100112de0828877148,
9+
recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
10+
recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
11+
recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,

src/agentlab/agents/agent_args.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ class AgentArgs(AbstractAgentArgs):
66
def set_benchmark(self, benchmark: str, demo_mode: bool):
77
"""Optional method to set benchmark specific flags.
88
9+
This allows the agent to make minor adjustments based on the benchmark,
10+
e.g. using a benchmark-specific action space, or letting the agent see
11+
HTML on MiniWoB since AXTree is not enough. Users should avoid
12+
extensive benchmark-specific prompt engineering.
13+
914
Args:
1015
benchmark: str
1116
Name of the benchmark.
@@ -14,3 +19,15 @@ def set_benchmark(self, benchmark: str, demo_mode: bool):
1419
the demo_mode flag in the browsergym action space.
1520
"""
1621
pass
22+
23+
def set_reproducibility_mode(self):
24+
"""Optional method to set the agent in a reproducibility mode.
25+
26+
This should adjust the agent configuration to make it as deterministic
27+
as possible, e.g. by setting the temperature of the model to 0.
28+
29+
This is only called when reproducibility is requested.
30+
"""
31+
raise NotImplementedError(
32+
f"set_reproducibility_mode is not implemented for agent_args {self.__class__.__name__}"
33+
)

src/agentlab/agents/generic_agent/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
AGENT_3_5,
33
AGENT_8B,
44
AGENT_LLAMA3_70B,
5+
AGENT_LLAMA31_70B,
56
AGENT_CUSTOM,
67
RANDOM_SEARCH_AGENT,
78
AGENT_4o,
@@ -15,6 +16,7 @@
1516
"AGENT_4o_MINI",
1617
"AGENT_4o_VISION",
1718
"AGENT_LLAMA3_70B",
19+
"AGENT_LLAMA31_70B",
1820
"AGENT_8B",
1921
"RANDOM_SEARCH_AGENT",
2022
"AGENT_CUSTOM",

src/agentlab/agents/generic_agent/generic_agent.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ def set_benchmark(self, benchmark, demo_mode):
3333
if demo_mode:
3434
self.flags.action.demo_mode = "all_blue"
3535

36+
def set_reproducibility_mode(self):
37+
self.chat_model_args.temperature = 0
38+
3639
def prepare(self):
3740
return self.chat_model_args.prepare_server()
3841

src/agentlab/agents/most_basic_agent/most_basic_agent.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,44 @@
11
import logging
2-
import os
3-
import re
42
from dataclasses import asdict, dataclass
53
from typing import TYPE_CHECKING, Any
64

7-
from browsergym.core.action.highlevel import HighLevelActionSet
8-
from browsergym.experiments.agent import Agent, AgentInfo
9-
from browsergym.experiments.loop import AbstractAgentArgs, EnvArgs, ExpArgs
5+
import bgym
106

117
from agentlab.llm.chat_api import make_system_message, make_user_message
128
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
139
from agentlab.llm.llm_utils import ParseError, extract_code_blocks, retry
1410
from agentlab.llm.tracking import cost_tracker_decorator
11+
from agentlab.agents.agent_args import AgentArgs
1512

1613
if TYPE_CHECKING:
1714
from agentlab.llm.chat_api import BaseModelArgs
1815

1916

2017
@dataclass
21-
class MostBasicAgentArgs(AbstractAgentArgs):
18+
class MostBasicAgentArgs(AgentArgs):
2219
agent_name: str = "BasicAgent"
2320
temperature: float = 0.1
2421
use_chain_of_thought: bool = False
2522
chat_model_args: "BaseModelArgs" = None
2623

27-
def make_agent(self) -> Agent:
24+
def make_agent(self) -> bgym.Agent:
2825
return MostBasicAgent(
2926
temperature=self.temperature,
3027
use_chain_of_thought=self.use_chain_of_thought,
3128
chat_model_args=self.chat_model_args,
3229
)
3330

31+
def set_reproducibility_mode(self):
32+
self.temperature = 0
33+
3434
def prepare(self):
3535
return self.chat_model_args.prepare_server()
3636

3737
def close(self):
3838
return self.chat_model_args.close_server()
3939

4040

41-
class MostBasicAgent(Agent):
41+
class MostBasicAgent(bgym.Agent):
4242
def __init__(
4343
self, temperature: float, use_chain_of_thought: bool, chat_model_args: "BaseModelArgs"
4444
):
@@ -47,7 +47,7 @@ def __init__(
4747
self.chat = chat_model_args.make_model()
4848
self.chat_model_args = chat_model_args
4949

50-
self.action_set = HighLevelActionSet(["bid"], multiaction=False)
50+
self.action_set = bgym.HighLevelActionSet(["bid"], multiaction=False)
5151

5252
@cost_tracker_decorator
5353
def get_action(self, obs: Any) -> tuple[str, dict]:
@@ -104,7 +104,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
104104

105105
return (
106106
action,
107-
AgentInfo(
107+
bgym.AgentInfo(
108108
think=thought,
109109
chat_messages=messages,
110110
# put any stats that you care about as long as it is a number or a dict of numbers
@@ -115,17 +115,19 @@ def parser(response: str) -> tuple[dict, bool, str]:
115115
)
116116

117117

118-
env_args = EnvArgs(
118+
# example for a single task
119+
env_args = bgym.EnvArgs(
119120
task_name="miniwob.click-button",
120121
task_seed=0,
121122
max_steps=10,
122123
headless=True,
123124
)
124125

125-
chat_model_args = CHAT_MODEL_ARGS_DICT["azure/gpt-35-turbo/gpt-35-turbo"]
126+
chat_model_args = CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"]
126127

128+
# example for 2 experiments testing chain of thoughts on a miniwob task
127129
exp_args = [
128-
ExpArgs(
130+
bgym.ExpArgs(
129131
agent_args=MostBasicAgentArgs(
130132
temperature=0.1,
131133
use_chain_of_thought=True,
@@ -134,7 +136,7 @@ def parser(response: str) -> tuple[dict, bool, str]:
134136
env_args=env_args,
135137
logging_level=logging.INFO,
136138
),
137-
ExpArgs(
139+
bgym.ExpArgs(
138140
agent_args=MostBasicAgentArgs(
139141
temperature=0.1,
140142
use_chain_of_thought=False,

src/agentlab/analyze/agent_xray.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ def run_gradio(results_dir: Path):
481481
tabs.select(tab_select)
482482

483483
demo.queue()
484-
demo.launch(server_port=7899)
484+
demo.launch(server_port=7899, share=True)
485485

486486

487487
def tab_select(evt: gr.SelectData):

src/agentlab/experiments/reproducibility_util.py

Lines changed: 1 addition & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def get_reproducibility_info(
167167
changes_white_list=( # Files that are often modified during experiments but do not affect reproducibility
168168
"*/reproducibility_script.py",
169169
"*reproducibility_journal.csv",
170-
"*/launch_command.py",
170+
"*main.py",
171171
),
172172
ignore_changes=False,
173173
):
@@ -347,63 +347,6 @@ def _verify_report(report_df: pd.DataFrame, agent_names=list[str], strict_reprod
347347
)
348348
return report_df
349349

350-
# def add_reward(info, study_dir, ignore_incomplete=False):
351-
# """Add the average reward and standard error to the info dict.
352-
353-
# Verifies that all tasks are completed and that there are no errors.
354-
# """
355-
# result_df = inspect_results.load_result_df(study_dir)
356-
# report = inspect_results.summarize_study(result_df)
357-
358-
# if len(report) > 1:
359-
# raise ValueError("Multi agent not implemented yet")
360-
361-
# if isinstance(info["agent_names"], (list, tuple)):
362-
# if len(info["agent_names"]) > 1:
363-
# raise ValueError("Multi agent not implemented yet")
364-
365-
# idx = report.index[0]
366-
# n_err = report.loc[idx, "n_err"].item()
367-
# n_completed, n_total = report.loc[idx, "n_completed"].split("/")
368-
# if n_err > 0 and not ignore_incomplete:
369-
# raise ValueError(
370-
# f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
371-
# )
372-
# if n_completed != n_total and not ignore_incomplete:
373-
# raise ValueError(
374-
# f"Experiment has {n_completed} completed tasks out of {n_total}. "
375-
# f"Please rerun the study and make sure all tasks are completed."
376-
# )
377-
378-
# for key in ("avg_reward", "std_err", "n_err", "n_completed"):
379-
# value = report.loc[idx, key]
380-
# if hasattr(value, "item"):
381-
# value = value.item()
382-
# info[key] = value
383-
384-
if isinstance(info["agent_name"], (list, tuple)):
385-
if len(info["agent_name"]) > 1:
386-
raise ValueError("Multi agent not implemented yet")
387-
388-
idx = report.index[0]
389-
n_err = report.loc[idx, "n_err"].item()
390-
n_completed, n_total = report.loc[idx, "n_completed"].split("/")
391-
if n_err > 0 and not ignore_incomplete:
392-
raise ValueError(
393-
f"Experiment has {n_err} errors. Please rerun the study and make sure all tasks are completed."
394-
)
395-
if n_completed != n_total and not ignore_incomplete:
396-
raise ValueError(
397-
f"Experiment has {n_completed} completed tasks out of {n_total}. "
398-
f"Please rerun the study and make sure all tasks are completed."
399-
)
400-
401-
for key in ("avg_reward", "std_err", "n_err", "n_completed"):
402-
value = report.loc[idx, key]
403-
if hasattr(value, "item"):
404-
value = value.item()
405-
info[key] = value
406-
407350

408351
def _get_csv_headers(file_path: str) -> list[str]:
409352
with open(file_path, "r", newline="") as file:
@@ -464,10 +407,3 @@ def append_to_journal(
464407
writer = csv.writer(file)
465408
for row in rows:
466409
writer.writerow(row)
467-
468-
469-
def set_temp(agent_args: GenericAgentArgs, temperature=0):
470-
"""Set temperature to 0. Assumes a GenericAgent structure."""
471-
agent_args = deepcopy(agent_args)
472-
agent_args.chat_model_args.temperature = temperature
473-
return agent_args

src/agentlab/experiments/study_generators.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ def run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False)
6565
self.write_reproducibility_info(strict_reproducibility=strict_reproducibility)
6666

6767
run_experiments(n_jobs, self.exp_args_list, self.dir, parallel_backend=parallel_backend)
68+
report_df = self.get_report(ignore_cache=True)
69+
logging.info(f"Study {self.name} finished.")
70+
logging.info("\n" + str(report_df))
6871

6972
def append_to_journal(self, strict_reproducibility=True):
7073
"""Append the study to the journal.

src/agentlab/llm/llm_configs.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import os
2-
31
from agentlab.llm.chat_api import (
42
AzureModelArgs,
53
OpenAIModelArgs,

0 commit comments

Comments
 (0)