config-driven gym with tools and bench

ollmer · ollmer · commit c74791512bdf · 2025-04-15T17:57:24.000+02:00
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,19 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}", // debug whichever file is currently open in the editor
+            "console": "integratedTerminal",
+            "justMyCode": false, // NOTE(review): presumably so the debugger also steps into library code - confirm
+            "env": {
+                "AGENTLAB_DEBUG": "1" // scripts/run_gaia.py checks this variable and then runs only 3 experiments sequentially
+            }
+        }
+    ]
+}
diff --git a/scripts/run_gaia.py b/scripts/run_gaia.py
@@ -1,20 +1,25 @@
 import logging
+import os
 
-from agentlab.agents.tapeagent.agent import TapeAgentArgs
-from agentlab.benchmarks.gaia import GaiaBenchmark
+from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config
+from agentlab.benchmarks.gaia import GaiaBenchmark, stop_old_sandbox
 from agentlab.experiments.study import make_study
 
 fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
 logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
 
 if __name__ == "__main__":
+    config = load_config("gaia_l1")  # compose the gaia_l1 hydra config (benchmark, agent and run settings)
     study = make_study(
-        benchmark=GaiaBenchmark(split="validation", level="1"),  # type: ignore
-        agent_args=TapeAgentArgs("gaia_agent"),
-        comment="Gaia eval",
+        benchmark=GaiaBenchmark.from_config(config),  # type: ignore
+        agent_args=TapeAgentArgs(agent_name=config.name, config=config),  # make_agent instantiates config.agent
+        comment=config.comment,
         logging_level=logging.INFO,
         logging_level_stdout=logging.INFO,
     )
-    # study.exp_args_list = study.exp_args_list[:3]
-    # study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")
-    study.run(n_jobs=8, n_relaunch=1, parallel_backend="ray")
+    stop_old_sandbox()  # best-effort stop of a leftover sandbox container before the study starts
+    if os.environ.get("AGENTLAB_DEBUG"):  # any non-empty value enables debug mode
+        study.exp_args_list = study.exp_args_list[:3]  # debug: keep only the first 3 experiments
+        study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")  # debug: single job, sequential backend
+    else:
+        study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend)  # job count and backend come from the config
diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py
@@ -4,6 +4,7 @@
 
 import bgym
 import hydra
+from omegaconf import DictConfig
 from pydantic import Field
 from tapeagents.agent import Agent
 from tapeagents.core import Action, Observation, TapeMetadata, Thought
@@ -29,28 +30,32 @@ class Tape(BaseTape):
     metadata: ExtendedMetadata = Field(default_factory=ExtendedMetadata)  # type: ignore
 
 
+def load_config(config_name: str) -> DictConfig:  # compose the hydra config named config_name and return it
+    with hydra.initialize(config_path="conf", version_base="1.1"):  # "conf" is the config directory (hydra takes a relative config_path as relative to the calling file)
+        config = hydra.compose(config_name=config_name)  # composed inside the initialize() context
+    return config
+
+
 @dataclass
 class TapeAgentArgs(AgentArgs):
-    agent_name: str = "tape_agent"
+    config: DictConfig = None  # type: ignore  # composed hydra config; make_agent reads its "agent" node
 
     def make_agent(self) -> bgym.Agent:
-        with hydra.initialize(config_path="conf", version_base="1.1"):
-            config = hydra.compose(config_name=self.agent_name)
-        agent: Agent = hydra.utils.instantiate(config)
+        agent: Agent = hydra.utils.instantiate(self.config.agent)  # build the agent from the already-composed config; fails if config is still None
         return TapeAgent(agent=agent)
 
 
 @dataclass
 class TapeAgentInfo(bgym.AgentInfo):
-    thoughts: list[Thought] = None
+    thoughts: list[Thought] = None  # type: ignore  # None default does not match the list annotation, hence the ignore
 
 
 class DictObservation(Observation):
     """
     Container for wrapping old dict observation into new Observation class.
     """
 
-    kind: Literal["dict_observation"] = "dict_observation"
+    kind: Literal["dict_observation"] = "dict_observation"  # type: ignore  # NOTE(review): presumably overrides the base class "kind" annotation - confirm
     content: str
 
 
@@ -70,8 +75,8 @@ def obs_preprocessor(self, obs: Observation | list[Observation]) -> list[Observa
         logger.info(f"Observations: {[type(o).__name__ for o in obs]}")
         return obs
 
-    def get_action(self, obs: Observation | list[Observation]) -> tuple[str, TapeAgentInfo]:
-        self.tape += obs
+    def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, TapeAgentInfo]:
+        self.tape += obs  # type: ignore
         thoughts: list[Thought] = []
         action = None
         while not action:
diff --git a/src/agentlab/agents/tapeagent/conf/agent/plan_react.yaml b/src/agentlab/agents/tapeagent/conf/agent/plan_react.yaml
@@ -1,15 +1,13 @@
-defaults:
-  - llm@llms.default: gpt4o_mini
-  - _self_
-
 _target_: tapeagents.agent.Agent
 name : gaia_agent
 max_iterations: 2
+llms:
+  default: ${llm}
 tools_description: |
-  - WebSearch - Performs a search in the web, wikipedia or youtube
-  - VideoReader - Opens video from a youtube URL. Can access the video content, thumbnail, subtitles and audio.
+  - WebSearch - Performs web search.
+  - VideoReader - Opens video from a youtube URL.
   - Browser - Browser tool that can load web pages and interact with their content.
-  - CodeExecutor - Executes the python code snippet
+  - CodeExecutor - Executes the python code snippet.
 known_actions:
   - _target_: hydra.utils.get_class
     path: tapeagents.tools.web_search.SearchAction
@@ -64,18 +62,18 @@ templates:
 nodes:
   - _target_: tapeagents.nodes.StandardNode
     name: plan
-    system_prompt: ${templates.system_prompt}
+    system_prompt: ${agent.templates.system_prompt}
     guidance: |
       Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task.
       Remember that you can use web search, browser, python code execution and access the youtube videos to reach your goals.
       Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet.
       Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed.
-      ${templates.thought_format}
-    steps_prompt: ${templates.allowed_tools}
+      ${agent.templates.thought_format}
+    steps_prompt: ${agent.templates.allowed_tools}
 
   - _target_: tapeagents.nodes.StandardNode
     name: facts_survey
-    system_prompt: ${templates.system_prompt}
+    system_prompt: ${agent.templates.system_prompt}
     guidance: |
       Before we begin executing the plan, please answer the following pre-survey.
       Here is the pre-survey:
@@ -84,16 +82,16 @@ nodes:
           3. Please list any facts that may need to be derived (e.g., via logical deduction, simulation, or computation)
           4. Please list any facts that are recalled from memory, hunches, well-reasoned guesses, etc.
       When answering this survey, keep in mind that "facts" will typically be specific names, dates, statistics, etc.
-      ${templates.thought_format}
-    steps_prompt: ${templates.allowed_tools}
+      ${agent.templates.thought_format}
+    steps_prompt: ${agent.templates.allowed_tools}
 
   - _target_: tapeagents.nodes.StandardNode
     name: act
-    system_prompt: ${templates.system_prompt}
+    system_prompt: ${agent.templates.system_prompt}
     guidance: |
       Produce single next step. If the answer is ready, produce gaia_answer_action.
-      ${templates.format}
-    steps_prompt: ${templates.allowed_steps}
+      ${agent.templates.format}
+    steps_prompt: ${agent.templates.allowed_steps}
     steps:
       - tapeagents.steps.ReasoningThought
       - agentlab.benchmarks.gaia.ExtractedFacts
diff --git a/src/agentlab/agents/tapeagent/conf/environment/web_code.yaml b/src/agentlab/agents/tapeagent/conf/environment/web_code.yaml
@@ -0,0 +1,11 @@
+tools:  # instantiated with hydra.utils.instantiate in GaiaGymArgs.make_env
+  - _target_: tapeagents.tools.web_search.WebSearch
+  - _target_: tapeagents.tools.media_reader.VideoReader
+    exp_path: ""  # placeholder; make_env overwrites exp_path with the experiment directory
+  - _target_: tapeagents.tools.browser.Browser
+    exp_path: ""  # placeholder; make_env overwrites exp_path with the experiment directory
+    viewport_chars: 64000
+    navigation_only: true
+  - _target_: tapeagents.tools.code_executor.CodeExecutor
+    exp_path: ""  # placeholder; make_env overwrites exp_path with the experiment directory
+    reuse_computer_container: true
diff --git a/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml b/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - llm: gpt4o_mini  # referenced as ${llm} from agent/plan_react.yaml
+  - agent: plan_react  # agent/plan_react.yaml
+  - environment: web_code  # environment/web_code.yaml
+  - _self_
+
+name: gaia_agent  # passed as agent_name to TapeAgentArgs in scripts/run_gaia.py
+comment: Gaia L1 val  # used as the study comment in scripts/run_gaia.py
+split: validation  # read by GaiaBenchmark.from_config; "test" or "validation"
+level: "1"  # read by GaiaBenchmark.from_config; "1", "2", "3" or "all"
+parallel_backend: ray  # passed to study.run when AGENTLAB_DEBUG is not set
+n_jobs: 10  # passed to study.run when AGENTLAB_DEBUG is not set
diff --git a/src/agentlab/agents/tapeagent/conf/gaia_val.yaml b/src/agentlab/agents/tapeagent/conf/gaia_val.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - llm: gpt4o_mini  # referenced as ${llm} from agent/plan_react.yaml
+  - agent: plan_react  # agent/plan_react.yaml
+  - environment: web_code  # environment/web_code.yaml
+  - _self_
+
+name: gaia_agent
+comment: Gaia val  # NOTE(review): scripts/run_gaia.py loads "gaia_l1"; this file has the same keys but level "all"
+split: validation  # GaiaBenchmark.split accepts "test" or "validation"
+level: "all"  # GaiaBenchmark.level accepts "1", "2", "3" or "all"
+parallel_backend: ray
+n_jobs: 10
diff --git a/src/agentlab/benchmarks/gaia.py b/src/agentlab/benchmarks/gaia.py
@@ -1,30 +1,30 @@
-import fcntl
 import logging
 import os
 import re
 import shutil
 import string
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any, Literal, Self
 
 import datasets
+import hydra
+import podman
+from omegaconf import DictConfig
 from pdf2image import convert_from_path
-from pydantic import Field
+from pydantic import ConfigDict, Field
 from tapeagents.core import Action, Observation, StopStep, Thought
 from tapeagents.environment import ContainerExecutor, StatefulTool, Tool
 from tapeagents.steps import ImageObservation
-from tapeagents.tools.browser import Browser
-from tapeagents.tools.code_executor import CodeExecutor
-from tapeagents.tools.media_reader import VideoReader
 from tapeagents.tools.simple_browser import SimpleTextBrowser
-from tapeagents.tools.web_search import WebSearch
 
 from agentlab.benchmarks.abstract_env import AbstractBenchmark, AbstractEnvArgs
 from agentlab.benchmarks.multitool_gym import MultiToolGym
 
 logger = logging.getLogger(__name__)
 
+CONTAINER_NAME = "gaia_code_shared"  # container name used by both init_code_sandbox and stop_old_sandbox
+
 
 class GaiaGym(MultiToolGym):
     task: dict
@@ -61,30 +61,33 @@ def calculate_reward(self, action: Action) -> float:
 
 @dataclass
 class GaiaGymArgs(AbstractEnvArgs):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     task: dict[str, Any]
-    viewport_chars: int
     task_seed: int
     task_name: str
+    env_config: DictConfig
 
     def __init__(
-        self, task_name: str, task: dict[str, Any], viewport_chars: int = 64000, task_seed: int = 0
+        self,
+        task_name: str,
+        task: dict[str, Any],
+        env_config: DictConfig,
+        task_seed: int = 0,
     ):
         self.task_name = task_name
         self.task = task
-        self.viewport_chars = viewport_chars
         self.task_seed = task_seed
+        self.env_config = env_config
 
     def make_env(self, exp_dir: str | Path, action_mapping=None) -> GaiaGym:
         exp_dir = str(exp_dir)
         logger.info(f"Init gaia env with directory {exp_dir}")
         os.environ["TAPEAGENTS_SQLITE_DB"] = os.path.join(exp_dir, "tapedata.sqlite")
         init_code_sandbox(exp_dir)
-        tools = [
-            WebSearch(),
-            VideoReader(exp_path=exp_dir),
-            Browser(exp_path=exp_dir, viewport_chars=self.viewport_chars, navigation_only=True),
-            CodeExecutor(exp_path=exp_dir, reuse_computer_container=True),
-        ]
+        for i in range(len(self.env_config.tools)):
+            if hasattr(self.env_config.tools[i], "exp_path"):
+                self.env_config.tools[i].exp_path = exp_dir
+        tools = hydra.utils.instantiate(self.env_config.tools)
         env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir)
         return env
 
@@ -94,27 +97,43 @@ def init_code_sandbox(exp_dir: str) -> None:
     root_exp_dir = Path(exp_dir).parent
     code_path = os.path.join(root_exp_dir, "shared_code")
     os.makedirs(code_path, exist_ok=True)
-
-    container_name = "gaia_code_shared"
-    os.environ["COMPUTER_CONTAINER_NAME"] = container_name
+    os.environ["COMPUTER_CONTAINER_NAME"] = CONTAINER_NAME
 
     # symlink task code to the shared code directory
     task_code_path = os.path.join(exp_dir, "code")
     if not os.path.exists(task_code_path):
         os.symlink(code_path, task_code_path)
 
     try:
-        ContainerExecutor(container_name=container_name, work_dir=code_path, no_deps=True)
+        ContainerExecutor(container_name=CONTAINER_NAME, work_dir=code_path, no_deps=True)
     except Exception as e:
         logger.warning(f"Failed to initialize container executor: {e}")
 
 
+def stop_old_sandbox():  # best-effort: stop the container named CONTAINER_NAME; failures are logged, not raised
+    try:
+        podman.from_env().containers.get(CONTAINER_NAME).stop()  # look the container up by name and stop it
+    except Exception as e:  # deliberately broad: any failure here only produces a warning
+        logger.warning(f"Failed to stop old container {CONTAINER_NAME}: {e}")
+
+
 class GaiaBenchmark(AbstractBenchmark):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     name: str = "gaia"
     split: Literal["test", "validation"]
     level: Literal["1", "2", "3", "all"] = "all"
     env_args_list: list[GaiaGymArgs] = None  # type: ignore
     dataset: dict = None  # type: ignore
+    env_config: DictConfig = None  # type: ignore
+
+    @classmethod
+    def from_config(cls, config: DictConfig, dataset: dict = None) -> Self:  # alternate constructor: build the benchmark from a composed config
+        return cls(
+            split=config.split,
+            level=config.level,
+            env_config=config.environment,  # the "environment" node is stored as env_config
+            dataset=dataset,  # None by default; model_post_init then takes its "if not self.dataset" branch
+        )
 
     def model_post_init(self, __context: Any) -> None:
         if not self.dataset:
@@ -130,7 +149,8 @@ def model_post_init(self, __context: Any) -> None:
                 continue
             number += 1
             task["number"] = number
-            env_args = GaiaGymArgs(task_name="gaia." + task["task_id"], task=task)
+            name = f"gaia.{task['task_id']}"
+            env_args = GaiaGymArgs(task_name=name, task=task, env_config=self.env_config)
             self.env_args_list.append(env_args)
         logger.info(f"Loaded {len(self.env_args_list)} tasks from {self.split} split")
 
diff --git a/tests/agents/test_gaia_agent.py b/tests/agents/test_gaia_agent.py