diff --git a/src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml b/src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml
index 017128cf..1e0ea55e 100644
--- a/src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml
+++ b/src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml
@@ -16,7 +16,7 @@ known_actions:
   - _target_: hydra.utils.get_class
     path: tapeagents.tools.code_executor.PythonCodeAction
   - _target_: hydra.utils.get_class
-    path: tapeagents.tools.browser.ClickAction
+    path: tapeagents.tools.browser.ClickBIDAction
   - _target_: hydra.utils.get_class
     path: tapeagents.tools.browser.GoBackAction
   - _target_: hydra.utils.get_class
@@ -82,13 +82,12 @@ nodes:
   - _target_: tapeagents.nodes.StandardNode
     name: act
     system_prompt: ${agent.templates.system_prompt}
-    guidance: |
-      Produce single next step. If the answer is ready, produce gaia_answer_action.
-      ${agent.templates.format}
+    guidance: Then produce a single function call for the next step. If the answer is ready, call GaiaAnswer.
     steps_prompt: ${agent.templates.allowed_steps}
     steps:
       - tapeagents.steps.ReasoningThought
       - agentlab.benchmarks.gaia.ExtractedFacts
       - agentlab.benchmarks.gaia.GaiaAnswer
     use_known_actions: true
+    use_function_calls: true
     next_node: act
\ No newline at end of file
diff --git a/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml b/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml
index bbd2a11d..0df163af 100644
--- a/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml
+++ b/src/agentlab/agents/tapeagent/conf/gaia_l1.yaml
@@ -1,12 +1,13 @@
 defaults:
-  - llm: o4mini
+  - llm: apriel
   - agent: plan_act
   - environment: web_code
   - _self_
 
-name: gaia_agent
+name: Apriel1p5
 comment: Gaia L1 val
 split: validation
 level: "1"
+max_turns: 20
 parallel_backend: ray
-n_jobs: 10
\ No newline at end of file
+n_jobs: 20
\ No newline at end of file
diff --git a/src/agentlab/agents/tapeagent/conf/gaia_val.yaml b/src/agentlab/agents/tapeagent/conf/gaia_val.yaml
index c867cecf..a703dab7 100644
--- a/src/agentlab/agents/tapeagent/conf/gaia_val.yaml
+++ b/src/agentlab/agents/tapeagent/conf/gaia_val.yaml
@@ -1,12 +1,13 @@
 defaults:
-  - llm: gpt4o_mini
+  - llm: apriel
   - agent: plan_act
   - environment: web_code
   - _self_
 
-name: gaia_agent
+name: Apriel1p5
 comment: Gaia val
 split: validation
 level: "all"
+max_turns: 20
 parallel_backend: ray
-n_jobs: 10
\ No newline at end of file
+n_jobs: 20
\ No newline at end of file
diff --git a/src/agentlab/agents/tapeagent/conf/llm/apriel.yaml b/src/agentlab/agents/tapeagent/conf/llm/apriel.yaml
new file mode 100644
index 00000000..14892888
--- /dev/null
+++ b/src/agentlab/agents/tapeagent/conf/llm/apriel.yaml
@@ -0,0 +1,9 @@
+_target_: tapeagents.llms.TrainableLLM
+model_name: Apriel1p5
+tokenizer_name: ServiceNow-AI/Apriel-1.5-15b-Thinker
+stream: false
+use_cache: false
+context_size: 128000
+base_url: localhost:8000
+parameters:
+  temperature: 0.6
\ No newline at end of file
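For illustration only, not part of the patch: a minimal sketch of how a Hydra LLM config such as `conf/llm/apriel.yaml` above resolves into an object. The repo-root-relative path and an installed `tapeagents` package are assumptions here.

```python
# Sketch: load the new LLM config and let Hydra build the _target_ class.
# Assumes the working directory is the repo root and tapeagents is installed.
import hydra.utils
from omegaconf import OmegaConf

llm_cfg = OmegaConf.load("src/agentlab/agents/tapeagent/conf/llm/apriel.yaml")
llm_cfg.base_url = "http://localhost:8000"  # run_gaia.py overrides this from LLM_BASE_URL
llm = hydra.utils.instantiate(llm_cfg)  # builds tapeagents.llms.TrainableLLM(model_name="Apriel1p5", ...)
print(type(llm).__name__)
```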
diff --git a/src/agentlab/agents/tapeagent/experiments/gaia.md b/src/agentlab/agents/tapeagent/experiments/gaia.md
new file mode 100644
index 00000000..4bd55f11
--- /dev/null
+++ b/src/agentlab/agents/tapeagent/experiments/gaia.md
@@ -0,0 +1,19 @@
+## Setup instructions
+- you need podman installed to run code execution and a serper.dev API key to use web search
+- to install and configure podman on a Mac, use the provided script `src/agentlab/agents/tapeagent/experiments/setup_gaia.sh`
+- once the podman machine is up and running, point the DOCKER_HOST env var at its socket: `export DOCKER_HOST=http+unix://$(podman machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}')`
+- set the env var with the serper.dev API key: `export SERPER_API_KEY=your_key`
+- set the env var with the URL of the inference endpoint: `export LLM_BASE_URL=your_endpoint_url`
+
+## Experiment configs
+- main configs: `src/agentlab/agents/tapeagent/conf/gaia_l1.yaml` for the L1 subset, `src/agentlab/agents/tapeagent/conf/gaia_val.yaml` for the full validation set
+- LLM configs are in `src/agentlab/agents/tapeagent/conf/llm`. Feel free to add your own.
+- the recommended agent architecture is `src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml`. It is already used in the main configs mentioned above.
+- the env config that describes the available tools: `src/agentlab/agents/tapeagent/conf/environment/web_code.yaml`
+
+## Running evaluation
+- to run in debug mode without parallelism: `AGENTLAB_DEBUG=1 python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
+- to run a quick parallel eval: `python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
+- you can edit the entrypoint script `src/agentlab/agents/tapeagent/experiments/run_gaia.py` to change the config name.
+- while a parallel eval is running, the Ray dashboard with progress is available at `http://127.0.0.1:8265/#/jobs/01000000`
+- experiment results are written to a subfolder of `~/agentlab_results/` whose name includes the current datetime, the agent name and the benchmark name
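The setup bullets in `gaia.md` above hinge on three environment variables. As an optional pre-flight check (a sketch; only the variable names from the instructions above are assumed), something like this can be run before launching an experiment:

```python
# Sketch: fail fast if any env var required by the setup instructions is missing.
import os

required = ("DOCKER_HOST", "SERPER_API_KEY", "LLM_BASE_URL")
missing = [var for var in required if not os.environ.get(var)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
print("Environment looks ready for run_gaia.py")
```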
diff --git a/src/agentlab/agents/tapeagent/experiments/run_gaia.py b/src/agentlab/agents/tapeagent/experiments/run_gaia.py
index ca613d46..82eebb14 100644
--- a/src/agentlab/agents/tapeagent/experiments/run_gaia.py
+++ b/src/agentlab/agents/tapeagent/experiments/run_gaia.py
@@ -10,6 +10,7 @@
 
 if __name__ == "__main__":
     config = load_config("gaia_l1")
+    config.llm.base_url = os.environ["LLM_BASE_URL"]
     study = make_study(
         benchmark=GaiaBenchmark.from_config(config),  # type: ignore
         agent_args=TapeAgentArgs(agent_name=config.name, config=config),
diff --git a/src/agentlab/benchmarks/gaia.py b/src/agentlab/benchmarks/gaia.py
index 0468e305..7f64efa7 100644
--- a/src/agentlab/benchmarks/gaia.py
+++ b/src/agentlab/benchmarks/gaia.py
@@ -31,8 +31,8 @@ class GaiaGym(MultiToolGym):
     task: dict
     exp_dir: str
 
-    def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str):
-        super().__init__(tools=tools)
+    def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str, max_turns: int):
+        super().__init__(tools=tools, max_turns=max_turns)
         self.task = task
         self.exp_dir = exp_dir
         os.makedirs(".cache", exist_ok=True)
@@ -67,20 +67,9 @@ class GaiaGymArgs(AbstractEnvArgs):
     task_seed: int
     task_name: str
     env_config: DictConfig
+    max_turns: int
 
-    def __init__(
-        self,
-        task_name: str,
-        task: dict[str, Any],
-        env_config: DictConfig,
-        task_seed: int = 0,
-    ):
-        self.task_name = task_name
-        self.task = task
-        self.task_seed = task_seed
-        self.env_config = env_config
-
-    def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
+    def make_env(self, exp_dir: Path, action_mapping=None, **kwargs) -> GaiaGym:
         tapeagents.config.DB_DEFAULT_FILENAME = str(exp_dir.parent / "tapedata.sqlite")
         exp_dir_str = str(exp_dir)
         logger.info(f"Init gaia env with directory {exp_dir_str}")
@@ -89,7 +78,7 @@ def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
             if hasattr(self.env_config.tools[i], "exp_path"):
                 self.env_config.tools[i].exp_path = exp_dir_str
         tools = hydra.utils.instantiate(self.env_config.tools)
-        env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str)
+        env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str, max_turns=self.max_turns)
         return env
 
 
@@ -122,6 +111,7 @@ class GaiaBenchmark(AbstractBenchmark):
     model_config = ConfigDict(arbitrary_types_allowed=True)
     name: str = "gaia"
     split: Literal["test", "validation"]
+    max_turns: int = 20
     level: Literal["1", "2", "3", "all"] = "all"
     env_args_list: list[GaiaGymArgs] = None  # type: ignore
     dataset: dict | None = None  # type: ignore
@@ -134,6 +124,7 @@ def from_config(cls, config: DictConfig, dataset: dict | None = None) -> Self:
             level=config.level,
             env_config=config.environment,
             dataset=dataset,
+            max_turns=config.max_turns,
         )
 
     def model_post_init(self, __context: Any) -> None:
@@ -151,7 +142,14 @@ def model_post_init(self, __context: Any) -> None:
                 number += 1
                 task["number"] = number
                 name = f"gaia.{task['task_id']}"
-                env_args = GaiaGymArgs(task_name=name, task=task, env_config=self.env_config)
+                task_seed = 0
+                env_args = GaiaGymArgs(
+                    task_name=name,
+                    task=task,
+                    task_seed=task_seed,
+                    env_config=self.env_config,
+                    max_turns=self.max_turns,
+                )
                 self.env_args_list.append(env_args)
 
         logger.info(f"Loaded {len(self.env_args_list)} tasks from {self.split} split")
@@ -192,7 +190,8 @@ def task_to_observations(task: dict, max_doc_length: int = 8000) -> list[Observa
     if not question.filename:
         return [question]
 
-    filename: str | None = question.filename
+    filename: str = question.filename
+    assert os.path.exists(filename), f"Attachment {filename} does not exist!"
     question.filename = None
     steps: list[Observation] = []
     name, ext = filename.rsplit(".", maxsplit=1)
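For context on the `max_turns` value threaded above from the experiment config through `GaiaBenchmark` and `GaiaGymArgs` into `GaiaGym`: a conceptual sketch of what a turn cap does to an episode loop. This is not the `MultiToolGym` implementation; `step` and `pick_action` are hypothetical stand-ins.

```python
# Conceptual sketch of a turn-capped episode loop; not the MultiToolGym code.
from typing import Callable

def run_episode(step: Callable, pick_action: Callable, max_turns: int = 20) -> list:
    trace = []
    for _ in range(max_turns):  # the episode is cut off after max_turns interactions
        action = pick_action(trace)
        observation, done = step(action)
        trace.append((action, observation))
        if done:
            break
    return trace

# With stand-ins that never finish, the loop still stops at 20 turns.
print(len(run_episode(step=lambda a: ("obs", False), pick_action=lambda t: "act")))
```

With `max_turns: 20` in `gaia_l1.yaml` and `gaia_val.yaml`, a task whose `act` node never emits `GaiaAnswer` is cut off after 20 steps instead of running indefinitely.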