Skip to content

Commit 2939d43

Browse files
committed
configs and updates to run gaia eval with apriel
1 parent 04358c3 commit 2939d43

File tree

7 files changed

+57
-28
lines changed

7 files changed

+57
-28
lines changed

src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ known_actions:
1616
- _target_: hydra.utils.get_class
1717
path: tapeagents.tools.code_executor.PythonCodeAction
1818
- _target_: hydra.utils.get_class
19-
path: tapeagents.tools.browser.ClickAction
19+
path: tapeagents.tools.browser.ClickBIDAction
2020
- _target_: hydra.utils.get_class
2121
path: tapeagents.tools.browser.GoBackAction
2222
- _target_: hydra.utils.get_class
@@ -82,13 +82,12 @@ nodes:
8282
- _target_: tapeagents.nodes.StandardNode
8383
name: act
8484
system_prompt: ${agent.templates.system_prompt}
85-
guidance: |
86-
Produce single next step. If the answer is ready, produce gaia_answer_action.
87-
${agent.templates.format}
85+
guidance: Then produce single function call for the next step. If the answer is ready, call GaiaAnswer.
8886
steps_prompt: ${agent.templates.allowed_steps}
8987
steps:
9088
- tapeagents.steps.ReasoningThought
9189
- agentlab.benchmarks.gaia.ExtractedFacts
9290
- agentlab.benchmarks.gaia.GaiaAnswer
9391
use_known_actions: true
92+
use_function_calls: true
9493
next_node: act
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
defaults:
2-
- llm: o4mini
2+
- llm: apriel
33
- agent: plan_act
44
- environment: web_code
55
- _self_
66

7-
name: gaia_agent
7+
name: Apriel1p5
88
comment: Gaia L1 val
99
split: validation
1010
level: "1"
11+
max_turns: 20
1112
parallel_backend: ray
12-
n_jobs: 10
13+
n_jobs: 20
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
defaults:
2-
- llm: gpt4o_mini
2+
- llm: apriel
33
- agent: plan_act
44
- environment: web_code
55
- _self_
66

7-
name: gaia_agent
7+
name: Apriel1p5
88
comment: Gaia val
99
split: validation
1010
level: "all"
11+
max_turns: 20
1112
parallel_backend: ray
12-
n_jobs: 10
13+
n_jobs: 20
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
_target_: tapeagents.llms.TrainableLLM
2+
model_name: Apriel1p5
3+
tokenizer_name: ServiceNow-AI/Apriel-1.5-15b-Thinker
4+
stream: false
5+
use_cache: false
6+
context_size: 128000
7+
base_url: localhost:8000
8+
parameters:
9+
temperature: 0.6
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
## Setup instructions
2+
- you need podman installed to run code execution and a serper.dev api key to use web search
3+
- to install and configure podman on a mac, use the provided script `src/agentlab/agents/tapeagent/experiments/setup_gaia.sh`
4+
- after the podman machine is up and running, set the DOCKER_HOST env var to its socket: `export DOCKER_HOST=http+unix://$(podman machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}')`
5+
- set the env var with the serper dev api key: `export SERPER_API_KEY=your_key`
6+
- set the env var with the url of the inference endpoint: `export LLM_BASE_URL=your_endpoint_url`
7+
8+
## Experiment configs:
9+
- main config: `src/agentlab/agents/tapeagent/conf/gaia_l1.yaml` for L1 subset, `src/agentlab/agents/tapeagent/conf/gaia_val.yaml` for full validation set
10+
- llm configs are in `src/agentlab/agents/tapeagent/conf/llm`. Feel free to add your own
11+
- recommended agent architecture to use is `src/agentlab/agents/tapeagent/conf/agent/plan_act.yaml`. It is already used in the main configs mentioned above.
12+
- env config that describes available tools: `src/agentlab/agents/tapeagent/conf/environment/web_code.yaml`
13+
14+
## Running evaluation:
15+
- to run in debug mode without parallelism: `AGENTLAB_DEBUG=1 python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
16+
- to run quick parallel eval: `python src/agentlab/agents/tapeagent/experiments/run_gaia.py`
17+
- you can adjust content of the entrypoint script `src/agentlab/agents/tapeagent/experiments/run_gaia.py` to change config name.
18+
- when parallel eval is running, the Ray dashboard with progress is available at `http://127.0.0.1:8265/#/jobs/01000000`
19+
- experiment results will be written to a subfolder of `~/agentlab_results/` with a name that includes the current datetime, agent name and benchmark name

src/agentlab/agents/tapeagent/experiments/run_gaia.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
if __name__ == "__main__":
1212
config = load_config("gaia_l1")
13+
config.llm.base_url = os.environ["LLM_BASE_URL"]
1314
study = make_study(
1415
benchmark=GaiaBenchmark.from_config(config), # type: ignore
1516
agent_args=TapeAgentArgs(agent_name=config.name, config=config),

src/agentlab/benchmarks/gaia.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ class GaiaGym(MultiToolGym):
3131
task: dict
3232
exp_dir: str
3333

34-
def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str):
35-
super().__init__(tools=tools)
34+
def __init__(self, tools: list[Tool | StatefulTool], task: dict, exp_dir: str, max_turns: int):
35+
super().__init__(tools=tools, max_turns=max_turns)
3636
self.task = task
3737
self.exp_dir = exp_dir
3838
os.makedirs(".cache", exist_ok=True)
@@ -67,20 +67,9 @@ class GaiaGymArgs(AbstractEnvArgs):
6767
task_seed: int
6868
task_name: str
6969
env_config: DictConfig
70+
max_turns: int
7071

71-
def __init__(
72-
self,
73-
task_name: str,
74-
task: dict[str, Any],
75-
env_config: DictConfig,
76-
task_seed: int = 0,
77-
):
78-
self.task_name = task_name
79-
self.task = task
80-
self.task_seed = task_seed
81-
self.env_config = env_config
82-
83-
def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
72+
def make_env(self, exp_dir: Path, action_mapping=None, **kwargs) -> GaiaGym:
8473
tapeagents.config.DB_DEFAULT_FILENAME = str(exp_dir.parent / "tapedata.sqlite")
8574
exp_dir_str = str(exp_dir)
8675
logger.info(f"Init gaia env with directory {exp_dir_str}")
@@ -89,7 +78,7 @@ def make_env(self, exp_dir: Path, action_mapping=None) -> GaiaGym:
8978
if hasattr(self.env_config.tools[i], "exp_path"):
9079
self.env_config.tools[i].exp_path = exp_dir_str
9180
tools = hydra.utils.instantiate(self.env_config.tools)
92-
env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str)
81+
env = GaiaGym(tools=tools, task=self.task, exp_dir=exp_dir_str, max_turns=self.max_turns)
9382
return env
9483

9584

@@ -122,6 +111,7 @@ class GaiaBenchmark(AbstractBenchmark):
122111
model_config = ConfigDict(arbitrary_types_allowed=True)
123112
name: str = "gaia"
124113
split: Literal["test", "validation"]
114+
max_turns: int = 20
125115
level: Literal["1", "2", "3", "all"] = "all"
126116
env_args_list: list[GaiaGymArgs] = None # type: ignore
127117
dataset: dict | None = None # type: ignore
@@ -134,6 +124,7 @@ def from_config(cls, config: DictConfig, dataset: dict | None = None) -> Self:
134124
level=config.level,
135125
env_config=config.environment,
136126
dataset=dataset,
127+
max_turns=config.max_turns,
137128
)
138129

139130
def model_post_init(self, __context: Any) -> None:
@@ -151,7 +142,14 @@ def model_post_init(self, __context: Any) -> None:
151142
number += 1
152143
task["number"] = number
153144
name = f"gaia.{task['task_id']}"
154-
env_args = GaiaGymArgs(task_name=name, task=task, env_config=self.env_config)
145+
task_seed = 0
146+
env_args = GaiaGymArgs(
147+
task_name=name,
148+
task=task,
149+
task_seed=task_seed,
150+
env_config=self.env_config,
151+
max_turns=self.max_turns,
152+
)
155153
self.env_args_list.append(env_args)
156154
logger.info(f"Loaded {len(self.env_args_list)} tasks from {self.split} split")
157155

@@ -192,7 +190,8 @@ def task_to_observations(task: dict, max_doc_length: int = 8000) -> list[Observa
192190
if not question.filename:
193191
return [question]
194192

195-
filename: str | None = question.filename
193+
filename: str = question.filename
194+
assert os.path.exists(filename), f"Attachment {filename} does not exist!"
196195
question.filename = None
197196
steps: list[Observation] = []
198197
name, ext = filename.rsplit(".", maxsplit=1)

0 commit comments

Comments
 (0)