Skip to content

Commit 5bf1bac

Browse files
committed
Added a flag to have the LLM guess whether the episode succeeded or not
1 parent 3a3d602 commit 5bf1bac

File tree

3 files changed

+130
-4
lines changed

3 files changed

+130
-4
lines changed

src/agentlab/analyze/error_analysis/pipeline.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def main():
7878
parser.add_argument("-f", "--filter", type=str, default=None)
7979
parser.add_argument("-p", "--parallel", action="store_true")
8080
parser.add_argument("-j", "--jobs", type=int, default=-1)
81+
parser.add_argument("-g", "--guess_success", action="store_true")
8182

8283
args = parser.parse_args()
8384

@@ -87,6 +88,7 @@ def main():
8788
filter = args.filter
8889
parallel = args.parallel
8990
jobs = args.jobs
91+
guess_success = args.guess_success
9092

9193
from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
9294

@@ -95,7 +97,9 @@ def main():
9597
pipeline = ErrorAnalysisPipeline(
9698
exp_dir=exp_dir,
9799
filter=filter,
98-
episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
100+
episode_summarizer=EpisodeErrorSummarizer(
101+
ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success
102+
),
99103
)
100104

101105
pipeline.run_analysis(parallel=parallel, jobs=jobs)

src/agentlab/analyze/error_analysis/summarizer.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from agentlab.analyze.error_analysis.summarizer_prompts import (
66
CHANGE_SUMMARIZER_PROMPT,
77
ERROR_CLASSIFICATION_PROMPT,
8+
ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
89
)
910
from agentlab.llm.llm_utils import json_parser, parse_html_tags
1011
from agentlab.llm.tracking import set_tracker
@@ -85,14 +86,16 @@ class EpisodeSummarizer:
8586
change_summarizer: ChangeSummarizer = None
8687
llm: callable = None
8788
parser: callable = lambda x: json_parser(x)[0]
89+
guess_success: bool = False
8890

8991
def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
9092

9193
def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
9294
"""Run Change Summarizer for every step in the episode or extract a pre-computed one."""
9395

94-
if exp_results.steps_info[-1].reward == 1:
95-
return {"analysis": "Success", "summaries": {}}
96+
if not self.guess_success:
97+
if exp_results.steps_info[-1].reward == 1:
98+
return {"analysis": "Success", "summaries": {}}
9699

97100
with set_tracker("summary") as summaries_tracker:
98101
summaries = self.make_change_summaries(exp_results)
@@ -154,7 +157,13 @@ def format_summary(summary):
154157

155158
extra_info = exp_results.steps_info[-1].task_info
156159

157-
return ERROR_CLASSIFICATION_PROMPT.format(
160+
prompt = (
161+
ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
162+
if self.guess_success
163+
else ERROR_CLASSIFICATION_PROMPT
164+
)
165+
166+
return prompt.format(
158167
goal=goal,
159168
historical_summaries=txt_summaries,
160169
action_history=txt_actions,

src/agentlab/analyze/error_analysis/summarizer_prompts.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,119 @@
5050
Action: {action}
5151
"""
5252

53+
ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """
54+
You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
55+
Below are the high-level definitions of each category,
56+
followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.),
57+
a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
58+
59+
--------------------------------------------------------------------------------
60+
TAXONOMY DEFINITIONS
61+
--------------------------------------------------------------------------------
62+
63+
1. Navigation & Planning Errors
64+
The agent cannot construct or execute a correct sequence of actions to reach its goal
65+
(e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
66+
67+
2. Interaction Execution Errors
68+
The agent enters data in the wrong format, forgets to click "Submit" after typing,
69+
repeats the same failing action without adaptation, or loses track of the changing webpage state.
70+
71+
3. Information Processing Errors
72+
The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
73+
misconstrues relationships between pieces of information, or fails to validate data against task requirements.
74+
75+
4. Observation & Action Errors
76+
The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
77+
or misaligns its actions (clicks the wrong element or stale link).
78+
79+
5. Task Understanding Errors
80+
The agent misreads or misunderstands the user's objective (goal interpretation),
81+
loses crucial context (context loss), or performs actions beyond or short of the intended scope.
82+
83+
6. Reasoning Failures
84+
The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
85+
or fails to prioritize important subtasks when handling complex goals.
86+
87+
--------------------------------------------------------------------------------
88+
INPUT DESCRIPTION
89+
--------------------------------------------------------------------------------
90+
91+
You will receive the following for each scenario:
92+
1. User Goal
93+
- The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
94+
95+
2. Historical change summaries
96+
- A list of summaries of changes in the observation that the agent has seen during the course of actions.
97+
98+
3. Action History
99+
- A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
100+
along with immediate outcomes or errors.
101+
102+
Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
103+
104+
--------------------------------------------------------------------------------
105+
FEW-SHOT CLASSIFICATION EXAMPLES
106+
--------------------------------------------------------------------------------
107+
108+
1) EXAMPLE A (Interaction Execution)
109+
• Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
110+
Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
111+
• Classification: ["Interaction Execution"]
112+
• Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
113+
without adaptation ("Action Repetition").
114+
115+
2) EXAMPLE B (Task Understanding)
116+
• Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
117+
that are older than 30 days and add a comment saying 'I can help fix this.'"
118+
The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
119+
with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
120+
• Classification: ["Task Understanding"]
121+
• Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
122+
it focused on creating a new issue. This is a misinterpretation of the instructions,
123+
not a mechanical error in clicking or input format.
124+
125+
--------------------------------------------------------------------------------
126+
CLASSIFICATION TASK
127+
--------------------------------------------------------------------------------
128+
129+
1. Read through:
130+
- The planning and thought history
131+
- The action history
132+
- The current HTML or AX Tree observation
133+
- The user goal
134+
135+
2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
136+
If the task is successful, you can keep the error category as blank.
137+
138+
3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
139+
140+
Output format example for an unsuccessful interaction:
141+
142+
<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
143+
<success>False</success>
144+
<errorCategory>["Navigation & Planning"]</errorCategory>
145+
146+
Output format example for a successful interaction:
147+
148+
<explanation>The agent opened the correct GitLab page and ...</explanation>
149+
<success>True</success>
150+
<errorCategory>[]</errorCategory>
151+
152+
Please follow this structure at every step. Keep your responses concise and clear.
153+
154+
Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
155+
156+
Overall goal: {goal}
157+
158+
Historical change summaries: {historical_summaries}
159+
160+
Action history: {action_history}
161+
162+
Extra information: {extra_info}
163+
"""
164+
165+
53166
ERROR_CLASSIFICATION_PROMPT = """
54167
You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
55168
Below are the high-level definitions of each category,

0 commit comments

Comments
 (0)