add GAIA gym

ollmer · ollmer · commit 0791f2dd17f8 · 2025-03-13T11:59:10.000+01:00
diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,7 @@ joblib>=1.2.0
 openai>=1.7,<2
 langchain_community
 tiktoken
-tapeagents[converters]~=0.1.4
+tapeagents[converters]
 huggingface_hub
 contexttimer
 ipython
diff --git a/src/agentlab/benchmarks/abstract_env.py b/src/agentlab/benchmarks/abstract_env.py
@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
 
 import gym
+from pydantic import BaseModel
 
 
-class AbstractEnvArgs(ABC):
+class AbstractEnvArgs(BaseModel):
     """Easily serialiazable class to store the arguments of an environment"""
 
     @abstractmethod
diff --git a/src/agentlab/benchmarks/gaia.py b/src/agentlab/benchmarks/gaia.py
@@ -0,0 +1,51 @@
+import os
+from typing import Literal
+
+import datasets
+from tapeagents.environment import ContainerExecutor
+from tapeagents.tools.browser import Browser
+from tapeagents.tools.code_executor import CodeExecutor
+from tapeagents.tools.container_executor import init_code_sandbox
+from tapeagents.tools.media_reader import VideoReader
+from tapeagents.tools.web_search import WebSearch
+
+from agentlab.benchmarks.abstract_env import AbstractEnvArgs
+from agentlab.benchmarks.multitool_gym import MultiToolGym
+
+
+class GaiaGym(MultiToolGym):
+    """Multi-tool gym bound to a single GAIA task.
+
+    Attributes:
+        task: The GAIA dataset record this environment was created for.
+        exp_dir: Experiment directory passed in by ``GaiaGymArgs.make_env``.
+    """
+
+    task: dict
+    exp_dir: str
+
+    def __init__(self, tools: list, task: dict, exp_dir: str):
+        """Store the task metadata and initialise the underlying tool gym.
+
+        The parent constructor shown in ``multitool_gym.py`` only accepts
+        ``tools``; without this override the keyword call
+        ``GaiaGym(tools=..., task=..., exp_dir=...)`` used by
+        ``GaiaGymArgs.make_env`` would fail with a ``TypeError``.
+
+        Args:
+            tools: Tool instances forwarded unchanged to ``MultiToolGym``.
+            task: The GAIA dataset record for this environment.
+            exp_dir: Directory where experiment artefacts are stored.
+        """
+        # Assign before delegating: the parent constructor calls reset(), so
+        # the attributes must already exist if reset() is ever overridden.
+        self.task = task
+        self.exp_dir = exp_dir
+        super().__init__(tools=tools)
+
+
+class GaiaGymArgs(AbstractEnvArgs):
+    """Arguments needed to build a ``GaiaGym`` for one GAIA task.
+
+    Attributes:
+        task_id: Value of the ``task_id`` field of the wanted dataset record.
+        split: Dataset split to look the task up in.
+        exp_dir: Experiment directory handed to the tools and the sandbox.
+        viewport_chars: Forwarded to ``Browser`` as ``viewport_chars``.
+    """
+
+    task_id: str
+    split: Literal["test", "validation"]
+    exp_dir: str
+    viewport_chars: int = 64000
+
+    def make_env(self) -> GaiaGym:
+        """Load the requested GAIA task and build its environment.
+
+        Returns:
+            A ``GaiaGym`` wired with web search, video reader, browser and
+            code executor tools.
+
+        Raises:
+            KeyError: If ``task_id`` is not present in the selected split.
+        """
+        # NOTE: this is the module-level helper imported from tapeagents,
+        # not the ``init_code_sandbox`` method defined on this class.
+        init_code_sandbox(self.exp_dir)
+        dataset = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")
+        tasks_by_id = {task["task_id"]: task for task in dataset[self.split]}
+        try:
+            task = tasks_by_id[self.task_id]
+        except KeyError:
+            # Same exception type as before, but say what was looked up and where.
+            raise KeyError(
+                f"GAIA task {self.task_id!r} not found in split {self.split!r}"
+            ) from None
+        tools = [
+            WebSearch(),
+            VideoReader(self.exp_dir),
+            Browser(self.exp_dir, viewport_chars=self.viewport_chars),
+            CodeExecutor(self.exp_dir),
+        ]
+        env = GaiaGym(tools=tools, task=task, exp_dir=self.exp_dir)
+        return env
+
+    def init_code_sandbox(self) -> None:
+        """Create ``<exp_dir>/code`` and construct a ``ContainerExecutor`` on it.
+
+        NOTE(review): nothing in this file calls this method; ``make_env``
+        uses the imported function of the same name instead. Confirm which
+        of the two is meant to be used.
+        """
+        code_path = os.path.join(self.exp_dir, "code")
+        os.makedirs(code_path, exist_ok=True)
+        # Slashes are replaced so the experiment path can serve as a container name.
+        # NOTE(review): an absolute exp_dir yields a name starting with "-";
+        # verify the container runtime accepts that.
+        container_name = self.exp_dir.replace("/", "-")
+        # The instance is discarded; presumably construction alone prepares
+        # the container. TODO confirm against tapeagents.
+        ContainerExecutor(
+            work_dir=code_path,
+            container_name=container_name,
+            restart_if_exists=False,
+            stop_container=False,
+            no_deps=True,
+        )
diff --git a/src/agentlab/benchmarks/multitool_gym.py b/src/agentlab/benchmarks/multitool_gym.py
@@ -51,8 +51,9 @@ def __init__(self, tools: list[Tool | Multitool]):
         )
         self.reset()
 
-    def reset(self, seed=None):
+    def reset(self):
         self._tape: EnvTape = EnvTape(steps=[])
+        self._env.reset()
 
     def step(self, action: str):
         try:

Original file line number	Diff line number	Diff line change
`@@ -51,8 +51,9 @@ def __init__(self, tools: list[Tool \| Multitool]):`
`51`	`51`	`)`
`52`	`52`	`self.reset()`
`53`	`53`
`54`		`- def reset(self, seed=None):`
	`54`	`+ def reset(self):`
`55`	`55`	`self._tape: EnvTape = EnvTape(steps=[])`
	`56`	`+ self._env.reset()`
`56`	`57`
`57`	`58`	`def step(self, action: str):`
`58`	`59`	`try:`