Get the GAIA agent working

ollmer · ollmer · commit ec8be7566388 · 2025-03-13T13:05:03.000+01:00
diff --git a/conf/gaia_agent.yaml b/conf/gaia_agent.yaml
@@ -1,8 +1,10 @@
+defaults:
+  - llm@llms.default: gpt4o
+  - _self_
+
 _target_: tapeagents.agent.Agent
-name : web_agent
+name : gaia_agent
 max_iterations: 2
-llms:
-  default: ${llm}
 templates:
   system_prompt: |
     You are an expert AI Agent trained to assist users with complex information processing tasks.
@@ -31,18 +33,18 @@ templates:
 nodes:
   - _target_: tapeagents.nodes.StandardNode
     name: plan
-    system_prompt: ${agent.templates.system_prompt}
+    system_prompt: ${templates.system_prompt}
     guidance: |
       Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task.
       Remember that you can use web search, browser, python code execution and access the youtube videos to reach your goals.
       Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet.
       Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed.
-      ${agent.templates.thought_format}
-    steps_prompt: ${agent.templates.allowed_tools}
+      ${templates.thought_format}
+    steps_prompt: ${templates.allowed_tools}
 
   - _target_: tapeagents.nodes.StandardNode
     name: facts_survey
-    system_prompt: ${agent.templates.system_prompt}
+    system_prompt: ${templates.system_prompt}
     guidance: |
       Before we begin executing the plan, please answer the following pre-survey.
       Here is the pre-survey:
@@ -51,19 +53,19 @@ nodes:
           3. Please list any facts that may need to be derived (e.g., via logical deduction, simulation, or computation)
           4. Please list any facts that are recalled from memory, hunches, well-reasoned guesses, etc.
       When answering this survey, keep in mind that "facts" will typically be specific names, dates, statistics, etc.
-      ${agent.templates.thought_format}
-    steps_prompt: ${agent.templates.allowed_tools}
+      ${templates.thought_format}
+    steps_prompt: ${templates.allowed_tools}
 
   - _target_: tapeagents.nodes.StandardNode
     name: act
-    system_prompt: ${agent.templates.system_prompt}
+    system_prompt: ${templates.system_prompt}
     guidance: |
       Produce single next step. If the answer is ready, produce gaia_answer_action.
-      ${agent.templates.format}
-    steps_prompt: ${agent.templates.allowed_steps}
+      ${templates.format}
+    steps_prompt: ${templates.allowed_steps}
     steps:
       - tapeagents.steps.ReasoningThought
-      - examples.gaia_agent.steps.ExtractedFacts
-      - examples.gaia_agent.steps.GaiaAnswer
+      - agentlab.benchmarks.gaia.ExtractedFacts
+      - agentlab.benchmarks.gaia.GaiaAnswer
     use_known_actions: true
     next_node: act
diff --git a/conf/llm/gpt4o.yaml b/conf/llm/gpt4o.yaml
@@ -0,0 +1,6 @@
+_target_: tapeagents.llms.LiteLLM
+model_name: gpt-4o-2024-08-06
+use_cache: false
+context_size: 128000
+parameters:
+  temperature: 0.2
diff --git a/requirements.txt b/requirements.txt
@@ -26,3 +26,4 @@ matplotlib
 ray[default]
 python-slugify
 pillow
+gymnasium>=0.27
diff --git a/src/agentlab/agents/tapeagent.py b/src/agentlab/agents/tapeagent.py
@@ -15,11 +15,11 @@
 
 @dataclass
 class TapeAgentArgs(AgentArgs):
-    config_name: str
+    agent_name: str
 
     def make_agent(self) -> bgym.Agent:
-        with hydra.initialize(config_path="./conf"):
-            config = hydra.compose(config_name=self.config_name)
+        with hydra.initialize(config_path="../../../conf"):
+            config = hydra.compose(config_name=self.agent_name)
         agent: Agent = hydra.utils.instantiate(config)
         return TapeAgent(agent=agent, tape=Tape(steps=[]))
 
@@ -28,6 +28,11 @@ class TapeAgent(bgym.Agent):
     agent: Agent
     tape: Tape
 
+    def __init__(self, agent: Agent, tape: Tape):
+        super().__init__()
+        self.agent = agent
+        self.tape = tape
+
     def obs_preprocessor(self, obs: dict) -> Any:
         logger.info(f"Preprocessing observation: {obs}")
         return obs
diff --git a/src/agentlab/benchmarks/gaia.py b/src/agentlab/benchmarks/gaia.py
@@ -1,8 +1,11 @@
 import os
+import shutil
 from typing import Any, Literal
 
 import bgym
 import datasets
+from pydantic import Field
+from tapeagents.core import Observation, StopStep, Thought
 from tapeagents.environment import ContainerExecutor
 from tapeagents.tools.browser import Browser
 from tapeagents.tools.code_executor import CodeExecutor
@@ -68,3 +71,60 @@ def init_code_sandbox(self) -> None:
             stop_container=False,
             no_deps=True,
         )
+
+
+class ExtractedFacts(Thought):
+    """
+    Thought that contains the list of facts extracted from the document
+    """
+
+    kind: Literal["extracted_facts_thought"] = "extracted_facts_thought"
+    extracted_facts: list[str] | dict[str, Any] | str = Field(
+        description="facts extracted from the observation"
+    )
+
+
+class GaiaQuestion(Observation):
+    kind: Literal["question"] = "question"
+    content: str
+    filename: str | None = None
+
+    @classmethod
+    def from_task(cls, question: dict):
+        question_prompt = question["Question"]
+        filename = None
+        if question["file_name"]:
+            basename = os.path.basename(question["file_name"])
+            tmp_fname = f"/tmp/{basename}"
+            shutil.copyfile(question["file_name"], tmp_fname)
+            assert os.path.exists(tmp_fname)
+            filename = tmp_fname
+        return cls(content=question_prompt, filename=filename)
+
+
+class GaiaAnswer(StopStep):
+    """
+    Action that indicates the agent has finished the plan and contains the answer or description of failure.
+    The answer should use already determined facts without additional conversion!
+    Your final answer should be a number OR as few words as possible OR a comma-separated list of numbers and/or strings.
+    ADDITIONALLY, your final answer MUST follow any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+    If asked for a number, express it numerically, don't use commas, do not add anything after the number, don't include units such as $ or percent signs unless specified otherwise in the question.
+    If asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+    If asked for a comma-separated list, apply the above rules depending on whether the elements are numbers or strings.
+    If unable to determine the final answer, output an empty string.
+    """
+
+    kind: Literal["gaia_answer_action"] = "gaia_answer_action"
+    success: bool = Field(
+        description="True if the task was successful, False otherwise"
+    )
+    overview: str = Field(
+        description="List of steps performed to answer the question. If the task was not successful, includes the reason for failure"
+    )
+    answer_unit: str = Field(
+        description="Unit of measurement for the answer, if applicable; otherwise an empty string"
+    )
+    answer: Any = Field(description="Short final answer")
+    long_answer: str = Field(
+        description="Detailed final answer not restricted by format rules"
+    )