"""
Note: This script is a convenience script to launch experiments instead of using
the command line.

Copy this script and modify at will, but don't push your changes to the
repository.
"""

import logging

from agentlab.agents.generic_agent import (
    CHAT_MODEL_ARGS_DICT,
    FLAGS_GPT_4o,
    GenericAgentArgs,
)
from agentlab.experiments.study import Study

logging.getLogger().setLevel(logging.INFO)

# Agent configurations to evaluate. Each entry pairs a chat model (looked up by
# its key in CHAT_MODEL_ARGS_DICT) with a set of agent flags.
agent_args = [
    GenericAgentArgs(
        chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-mini-2025-04-14"],
        flags=FLAGS_GPT_4o,
    )
]


# ## Select the benchmark to run on (uncomment exactly one).
benchmark = "miniwob_tiny_test"
# benchmark = "miniwob"
# benchmark = "workarena_l1"
# benchmark = "workarena_l2"
# benchmark = "workarena_l3"
# benchmark = "webarena"

# Set reproducibility_mode = True for reproducibility.
# This will "ask" agents to be deterministic. Also, it will prevent you from
# launching if you have local changes. For your custom agents you need to
# implement set_reproducibility_mode.
reproducibility_mode = False

# Set relaunch = True to relaunch an existing study. This will continue
# incomplete experiments and relaunch errored experiments.
relaunch = False

# ## Number of parallel jobs.
n_jobs = 4  # Make sure to use 1 job when debugging in VSCode
# n_jobs = -1  # to use all available cores


# The guard keeps worker processes that re-import this module from launching
# the study again (the original note mentions the dask backend).
# NOTE(review): confirm whether the "ray" backend selected below needs it too.
if __name__ == "__main__":

    if reproducibility_mode:
        # Plain loop: the calls are made for their side effect only, so there
        # is no reason to build a throwaway list.
        for agent in agent_args:
            agent.set_reproducibility_mode()

    if relaunch:
        # Relaunch an existing study: load the most recent one and select the
        # experiments that are incomplete or ended in error.
        study = Study.load_most_recent(contains=None)
        study.find_incomplete(include_errors=True)
    else:
        # Start a fresh study for the configured agents and benchmark.
        study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING)

    study.run(
        n_jobs=n_jobs,
        parallel_backend="ray",  # "ray", "joblib" or "sequential"
        strict_reproducibility=reproducibility_mode,
        # NOTE(review): presumably the maximum number of relaunch rounds --
        # confirm against Study.run.
        n_relaunch=3,
    )

    if reproducibility_mode:
        study.append_to_journal(strict_reproducibility=True)