exp for new models

recursix · recursix · commit 3608dd69a20c · 2025-06-30T19:20:09.000-04:00
diff --git a/main_exp_new_models.py b/main_exp_new_models.py
@@ -0,0 +1,71 @@
+"""
+Convenience script that launches a study with the GPT-4.1-mini generic agent,
+as an alternative to using the command line.
+
+Copy this script and modify at will, but don't push your changes to the
+repository.
+"""
+
+import logging
+
+from agentlab.agents.generic_agent import (
+    CHAT_MODEL_ARGS_DICT,
+    FLAGS_GPT_4o,
+    GenericAgentArgs,
+)
+from agentlab.experiments.study import Study
+
+logging.getLogger().setLevel(logging.INFO)
+
+# ## select the agent(s) to evaluate (GPT-4.1-mini model args with FLAGS_GPT_4o)
+agent_args = [
+    GenericAgentArgs(
+        chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-mini-2025-04-14"],
+        flags=FLAGS_GPT_4o,
+    )
+]
+
+# ## select the benchmark to run on
+benchmark = "miniwob_tiny_test"
+# benchmark = "miniwob"
+# benchmark = "workarena_l1"
+# benchmark = "workarena_l2"
+# benchmark = "workarena_l3"
+# benchmark = "webarena"
+
+# Set reproducibility_mode = True for reproducibility.
+# This will "ask" agents to be deterministic. It will also prevent you from launching if you
+# have local changes. Custom agents need to implement set_reproducibility_mode.
+reproducibility_mode = False
+
+# Set relaunch = True to relaunch an existing study; this will continue incomplete
+# experiments and relaunch errored experiments.
+relaunch = False
+
+# ## number of parallel jobs
+n_jobs = 4  # Make sure to use 1 job when debugging in VSCode
+# n_jobs = -1  # to use all available cores
+
+
+if __name__ == "__main__":  # required when the parallel backend spawns worker processes
+    if reproducibility_mode:
+        for agent in agent_args:
+            agent.set_reproducibility_mode()
+
+    if relaunch:
+        # reload the most recent study and select its incomplete / errored experiments
+        study = Study.load_most_recent(contains=None)
+        study.find_incomplete(include_errors=True)
+
+    else:
+        study = Study(agent_args, benchmark, logging_level_stdout=logging.WARNING)
+
+    study.run(
+        n_jobs=n_jobs,
+        parallel_backend="ray",  # "ray", "joblib" or "sequential"
+        strict_reproducibility=reproducibility_mode,
+        n_relaunch=3,
+    )
+
+    if reproducibility_mode:
+        study.append_to_journal(strict_reproducibility=True)
diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py
@@ -9,20 +9,23 @@
 from .agent_configs import (
     AGENT_3_5,
     AGENT_8B,
+    AGENT_37_SONNET,
+    AGENT_CLAUDE_SONNET_35,
+    AGENT_CLAUDE_SONNET_35_VISION,
     AGENT_CUSTOM,
-    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA3_70B,
+    AGENT_LLAMA4_17B_INSTRUCT,
     AGENT_LLAMA31_70B,
+    CHAT_MODEL_ARGS_DICT,
     RANDOM_SEARCH_AGENT,
     AGENT_4o,
     AGENT_4o_MINI,
-    AGENT_CLAUDE_SONNET_35,
-    AGENT_37_SONNET,
-    AGENT_CLAUDE_SONNET_35_VISION,
-    AGENT_4o_VISION,
     AGENT_4o_MINI_VISION,
-    AGENT_o3_MINI,
+    AGENT_4o_VISION,
     AGENT_o1_MINI,
+    AGENT_o3_MINI,
+    FLAGS_GPT_4o,
+    GenericAgentArgs,
 )
 
 __all__ = [
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
@@ -17,6 +17,20 @@
 ]
 
 CHAT_MODEL_ARGS_DICT = {
+    "openai/gpt-4.1-mini-2025-04-14": OpenAIModelArgs(
+        model_name="gpt-4.1-mini-2025-04-14",
+        max_total_tokens=128_000,
+        max_input_tokens=128_000,
+        max_new_tokens=16_384,
+        vision_support=True,
+    ),
+    "openai/gpt-4.1-2025-04-14": OpenAIModelArgs(
+        model_name="gpt-4.1-2025-04-14",
+        max_total_tokens=128_000,
+        max_input_tokens=128_000,
+        max_new_tokens=16_384,
+        vision_support=True,
+    ),
     "openai/o3-mini-2025-01-31": OpenAIModelArgs(
         model_name="o3-mini-2025-01-31",
         max_total_tokens=200_000,