Skip to content

Commit 8a882ad

Browse files
committed
pipeline mvp
1 parent 46d2c8c commit 8a882ad

File tree

3 files changed

+47
-35
lines changed

3 files changed

+47
-35
lines changed

src/agentlab/analyze/error_analysis/pipeline.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66

77
from bgym import ExpResult
88

9-
from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer
9+
from agentlab.analyze.error_analysis.summarizer import (
10+
ChangeSummarizer,
11+
EpisodeErrorSummarizer,
12+
EpisodeSummarizer,
13+
)
1014
from agentlab.analyze.inspect_results import yield_all_exp_results
1115

1216

@@ -24,7 +28,6 @@ class ErrorAnalysisPipeline:
2428
exp_dir: Path
2529
filter: str = None
2630
episode_summarizer: EpisodeSummarizer = None
27-
analyzer: Analyzer = None
2831

2932
def filter_exp_results(self) -> Generator[ExpResult, None, None]:
3033
# TODO:(thibault) improve filtering
@@ -37,23 +40,16 @@ def run_analysis(self):
3740
filtered_results = self.filter_exp_results()
3841

3942
for exp_result in filtered_results:
40-
episode_summary = self.episode_summarizer(exp_result)
41-
error_analysis = self.analyze_errors(exp_result, episode_summary)
43+
error_analysis = self.episode_summarizer(exp_result)
4244
self.save_analysis(exp_result, error_analysis)
4345

44-
def analyze_errors(
45-
self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
46-
) -> str:
47-
error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis)
48-
return error_analysis
49-
5046
def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
5147
"""Save the analysis to json"""
5248
analysis_path = exp_result.exp_dir / "error_analysis.json"
5349
if not exists_ok and analysis_path.exists():
5450
raise FileExistsError(f"{analysis_path} already exists")
5551
with analysis_path.open("w") as f:
56-
json.dump(error_analysis, f)
52+
json.dump(error_analysis, f, indent=4)
5753

5854

5955
if __name__ == "__main__":
@@ -67,8 +63,6 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
6763
exp_dir = Path(args.exp_dir)
6864
filter = args.filter
6965

70-
import openai
71-
7266
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
7367

7468
llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
@@ -79,9 +73,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
7973
pipeline = ErrorAnalysisPipeline(
8074
exp_dir=exp_dir,
8175
filter=filter,
82-
episode_summarizer=EpisodeSummarizer(),
83-
step_summarizer=ChangeSummarizer(),
84-
analyzer=Analyzer("prompt"),
76+
episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm),
8577
)
8678

8779
pipeline.run_analysis()

src/agentlab/analyze/error_analysis/summarizer.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
ERROR_CLASSIFICATION_PROMPT,
88
)
99
from agentlab.analyze.inspect_results import summarize
10+
from agentlab.llm.llm_utils import json_parser
1011

1112

1213
def _diff(past_obs, current_obs):
@@ -21,7 +22,7 @@ def _diff(past_obs, current_obs):
2122
class ChangeSummarizer:
2223

2324
llm: callable # language model
24-
obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available")
25+
obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
2526
use_diff: bool = False
2627

2728
def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
@@ -74,20 +75,35 @@ class EpisodeAnalysis:
7475
class EpisodeSummarizer:
7576

7677
change_summarizer: ChangeSummarizer = None
78+
llm: callable = None
79+
parser: callable = lambda x: json_parser(x)[0]
7780

7881
def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
7982

8083
def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
8184
"""Run Change Summarizer for every step in the episode or extract a pre-computed one."""
85+
86+
if exp_results.steps_info[-1].reward == 1:
87+
return {"analysis": "Success", "summaries": {}}
88+
8289
summaries = self.make_change_summaries(exp_results)
90+
prompt = self.make_prompt(exp_results, summaries)
91+
raw_analysis = self.llm(prompt)["content"]
92+
analysis = self.parser(raw_analysis)
93+
return {
94+
"analysis": analysis,
95+
"summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
96+
}
8397

8498
def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
8599
summaries = [] # type: list[str]
86100
# this assumes that there is always an extra step at the end of the episode
87101
# it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
88102
# TODO:(thibault) make some checks or w/e
89103
for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
90-
summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
104+
summaries.append(
105+
self.change_summarizer.summarize(step, next_step, summaries)["content"]
106+
)
91107
return summaries
92108

93109

@@ -96,12 +112,26 @@ class EpisodeErrorSummarizer(EpisodeSummarizer):
96112

97113
change_summarizer: ChangeSummarizer = None
98114

99-
def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
115+
def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
100116
"""TODO: Implement the prompt."""
117+
goal = exp_results.steps_info[0].obs["goal"]
118+
119+
txt_summaries = "\n".join(summaries)
120+
121+
thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
122+
actions = [step.action for step in exp_results.steps_info[:-1]]
123+
action_errors = "\n".join(
124+
[step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
125+
)
126+
127+
txt_actions = "\n".join(
128+
[
129+
f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}"
130+
for action, thought, action_error in zip(actions, thoughts, action_errors)
131+
]
132+
)
101133
return ERROR_CLASSIFICATION_PROMPT.format(
102134
goal=goal,
103-
plan=plan,
104-
current_observation=current_observation,
105-
historical_summaries=historical_summaries,
106-
action_history=action_history,
135+
historical_summaries=txt_summaries,
136+
action_history=txt_actions,
107137
)

src/agentlab/analyze/error_analysis/summarizer_prompts.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -110,17 +110,11 @@
110110
You will receive the following for each scenario:
111111
1. User Goal
112112
- The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
113-
114-
2. Planning / Thought History
115-
- The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
116113
117-
3. Current Observation (HTML / AX Tree Snippet)
118-
- The webpage structure or state that the agent sees at a given point in time.
119-
120-
4. Historical change summaries
114+
2. Historical change summaries
121115
- A list of summaries of changes in the observation that the agent has seen during the course of actions.
122116
123-
5. Action History
117+
3. Action History
124118
- A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
125119
along with immediate outcomes or errors.
126120
@@ -192,10 +186,6 @@
192186
193187
Overall goal: {goal}
194188
195-
LLM Plan and thought history: {plan}
196-
197-
Current Observation: {current_observation}
198-
199189
Historical change summaries: {historical_summaries}
200190
201191
Action history: {action_history}

0 commit comments

Comments
 (0)