|
| 1 | +import os |
| 2 | +from typing import Literal |
| 3 | + |
| 4 | +import datasets |
| 5 | +from tapeagents.environment import ContainerExecutor |
| 6 | +from tapeagents.tools.browser import Browser |
| 7 | +from tapeagents.tools.code_executor import CodeExecutor |
| 8 | +from tapeagents.tools.container_executor import init_code_sandbox |
| 9 | +from tapeagents.tools.media_reader import VideoReader |
| 10 | +from tapeagents.tools.web_search import WebSearch |
| 11 | + |
| 12 | +from agentlab.benchmarks.abstract_env import AbstractEnvArgs |
| 13 | +from agentlab.benchmarks.multitool_gym import MultiToolGym |
| 14 | + |
| 15 | + |
class GaiaGym(MultiToolGym):
    """Multi-tool gym bound to a single GAIA task.

    Adds two fields on top of whatever ``MultiToolGym`` declares.
    """

    # Task record as taken from the GAIA dataset split.
    task: dict
    # Experiment directory associated with this environment.
    exp_dir: str
| 19 | + |
| 20 | + |
class GaiaGymArgs(AbstractEnvArgs):
    """Arguments describing how to build a ``GaiaGym`` for one GAIA task."""

    # Identifier looked up in the ``task_id`` column of the chosen split.
    task_id: str
    # Which dataset split the task is taken from.
    split: Literal["test", "validation"]
    # Directory handed to the sandbox initializer and to the tools.
    exp_dir: str
    # Forwarded to ``Browser`` as its ``viewport_chars`` argument.
    viewport_chars: int = 64000

    def make_env(self) -> GaiaGym:
        """Create the gym for ``self.task_id``.

        Initializes the code sandbox, loads the ``2023_all`` configuration of
        ``gaia-benchmark/GAIA``, selects the task from ``self.split`` and wires
        up the tool set.

        Returns:
            A ``GaiaGym`` holding the tools, the task record and ``exp_dir``.

        Raises:
            KeyError: if ``self.task_id`` is not present in the selected split.
        """
        # NOTE(review): this bare name resolves to the module-level function
        # imported from tapeagents, not to the method of the same name defined
        # on this class -- confirm which one is intended.
        init_code_sandbox(self.exp_dir)

        gaia = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")
        split_rows = gaia[self.split]
        index = {row["task_id"]: row for row in split_rows}
        selected_task = index[self.task_id]

        # Tools are constructed in the same order they are listed.
        toolbox = [
            WebSearch(),
            VideoReader(self.exp_dir),
            Browser(self.exp_dir, viewport_chars=self.viewport_chars),
            CodeExecutor(self.exp_dir),
        ]
        return GaiaGym(tools=toolbox, task=selected_task, exp_dir=self.exp_dir)

    def init_code_sandbox(self) -> None:
        """Create ``<exp_dir>/code`` and construct a ``ContainerExecutor`` on it.

        The container name is ``exp_dir`` with every ``/`` replaced by ``-``.
        The constructed executor object is not kept.
        """
        sandbox_dir = os.path.join(self.exp_dir, "code")
        os.makedirs(sandbox_dir, exist_ok=True)
        ContainerExecutor(
            work_dir=sandbox_dir,
            container_name=self.exp_dir.replace("/", "-"),
            restart_if_exists=False,
            stop_container=False,
            no_deps=True,
        )
0 commit comments