pipeline mvp

TLSDC · TLSDC · commit 8a882ad58d23 · 2025-01-28T11:09:53.000-05:00
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -6,7 +6,11 @@
 
 from bgym import ExpResult
 
-from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer
+from agentlab.analyze.error_analysis.summarizer import (
+    ChangeSummarizer,
+    EpisodeErrorSummarizer,
+    EpisodeSummarizer,
+)
 from agentlab.analyze.inspect_results import yield_all_exp_results
 
 
@@ -24,7 +28,6 @@ class ErrorAnalysisPipeline:
     exp_dir: Path
     filter: str = None
     episode_summarizer: EpisodeSummarizer = None
-    analyzer: Analyzer = None
 
     def filter_exp_results(self) -> Generator[ExpResult, None, None]:
         # TODO:(thibault) improve filtering
@@ -37,23 +40,16 @@ def run_analysis(self):
         filtered_results = self.filter_exp_results()
 
         for exp_result in filtered_results:
-            episode_summary = self.episode_summarizer(exp_result)
-            error_analysis = self.analyze_errors(exp_result, episode_summary)
+            error_analysis = self.episode_summarizer(exp_result)
             self.save_analysis(exp_result, error_analysis)
 
-    def analyze_errors(
-        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
-    ) -> str:
-        error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis)
-        return error_analysis
-
     def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
         """Save the analysis to json"""
         analysis_path = exp_result.exp_dir / "error_analysis.json"
         if not exists_ok and analysis_path.exists():
             raise FileExistsError(f"{analysis_path} already exists")
         with analysis_path.open("w") as f:
-            json.dump(error_analysis, f)
+            json.dump(error_analysis, f, indent=4)
 
 
 if __name__ == "__main__":
@@ -67,8 +63,6 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     exp_dir = Path(args.exp_dir)
     filter = args.filter
 
-    import openai
-
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
     llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
@@ -79,9 +73,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeSummarizer(),
-        step_summarizer=ChangeSummarizer(),
-        analyzer=Analyzer("prompt"),
+        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm),
     )
 
     pipeline.run_analysis()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -7,6 +7,7 @@
     ERROR_CLASSIFICATION_PROMPT,
 )
 from agentlab.analyze.inspect_results import summarize
+from agentlab.llm.llm_utils import json_parser
 
 
 def _diff(past_obs, current_obs):
@@ -21,7 +22,7 @@ def _diff(past_obs, current_obs):
 class ChangeSummarizer:
 
     llm: callable  # language model
-    obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available")
+    obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
     use_diff: bool = False
 
     def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
@@ -74,20 +75,35 @@ class EpisodeAnalysis:
 class EpisodeSummarizer:
 
     change_summarizer: ChangeSummarizer = None
+    llm: callable = None
+    parser: callable = lambda x: json_parser(x)[0]
 
     def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
 
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
+
+        if exp_results.steps_info[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
+
         summaries = self.make_change_summaries(exp_results)
+        prompt = self.make_prompt(exp_results, summaries)
+        raw_analysis = self.llm(prompt)["content"]
+        analysis = self.parser(raw_analysis)
+        return {
+            "analysis": analysis,
+            "summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
+        }
 
     def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         summaries = []  # type: list[str]
         # this assumes that there is always an extra step at the end of the episode
         # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
         # TODO:(thibault) make some checks or w/e
         for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
-            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+            summaries.append(
+                self.change_summarizer.summarize(step, next_step, summaries)["content"]
+            )
         return summaries
 
 
@@ -96,12 +112,26 @@ class EpisodeErrorSummarizer(EpisodeSummarizer):
 
     change_summarizer: ChangeSummarizer = None
 
-    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
-        """TODO: Implement the prompt."""
+        """Format ERROR_CLASSIFICATION_PROMPT with the goal, summaries and action log."""
+        goal = exp_results.steps_info[0].obs["goal"]
+
+        txt_summaries = "\n".join(summaries)
+
+        thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
+        actions = [step.action for step in exp_results.steps_info[:-1]]
+        # Keep one error per step: joining them into a single string here would make
+        # the zip below pair each action with one character of that string.
+        action_errors = [
+            step.obs["last_action_error"] for step in exp_results.steps_info[1:]
+        ]
+
+        txt_actions = "\n".join(
+            f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}"
+            for action, thought, action_error in zip(actions, thoughts, action_errors)
+        )
         return ERROR_CLASSIFICATION_PROMPT.format(
             goal=goal,
-            plan=plan,
-            current_observation=current_observation,
-            historical_summaries=historical_summaries,
-            action_history=action_history,
+            historical_summaries=txt_summaries,
+            action_history=txt_actions,
         )
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -110,17 +110,11 @@
 You will receive the following for each scenario:
 1. User Goal
    - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
-   
-2. Planning / Thought History
-   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
 
-3. Current Observation (HTML / AX Tree Snippet)
-   - The webpage structure or state that the agent sees at a given point in time.
-
-4. Historical change summaries
+2. Historical change summaries
    - A list of summaries of changes in the observation that the agent has seen during the course of actions.
 
-5. Action History
+3. Action History
    - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
      along with immediate outcomes or errors.
 
@@ -192,10 +186,6 @@
 
 Overall goal: {goal}
 
-LLM Plan and thought history: {plan}
-
-Current Observation: {current_observation}
-
 Historical change summaries: {historical_summaries}
 
 Action history: {action_history}