Add --guess_success flag so the LLM judges episode success instead of relying on the final reward

TLSDC · TLSDC · commit 5bf1bac08a99 · 2025-02-20T16:14:12.000-05:00
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -78,6 +78,7 @@ def main():
     parser.add_argument("-f", "--filter", type=str, default=None)
     parser.add_argument("-p", "--parallel", action="store_true")
     parser.add_argument("-j", "--jobs", type=int, default=-1)
+    parser.add_argument("-g", "--guess_success", action="store_true")
 
     args = parser.parse_args()
 
@@ -87,6 +88,7 @@ def main():
     filter = args.filter
     parallel = args.parallel
     jobs = args.jobs
+    guess_success = args.guess_success
 
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
@@ -95,7 +97,9 @@ def main():
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
+        episode_summarizer=EpisodeErrorSummarizer(
+            ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success
+        ),
     )
 
     pipeline.run_analysis(parallel=parallel, jobs=jobs)
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -5,6 +5,7 @@
 from agentlab.analyze.error_analysis.summarizer_prompts import (
     CHANGE_SUMMARIZER_PROMPT,
     ERROR_CLASSIFICATION_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
 )
 from agentlab.llm.llm_utils import json_parser, parse_html_tags
 from agentlab.llm.tracking import set_tracker
@@ -85,14 +86,16 @@ class EpisodeSummarizer:
     change_summarizer: ChangeSummarizer = None
     llm: callable = None
     parser: callable = lambda x: json_parser(x)[0]
+    guess_success: bool = False
 
     def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
 
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
 
-        if exp_results.steps_info[-1].reward == 1:
-            return {"analysis": "Success", "summaries": {}}
+        if not self.guess_success:
+            if exp_results.steps_info[-1].reward == 1:
+                return {"analysis": "Success", "summaries": {}}
 
         with set_tracker("summary") as summaries_tracker:
             summaries = self.make_change_summaries(exp_results)
@@ -154,7 +157,13 @@ def format_summary(summary):
 
         extra_info = exp_results.steps_info[-1].task_info
 
-        return ERROR_CLASSIFICATION_PROMPT.format(
+        prompt = (
+            ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
+            if self.guess_success
+            else ERROR_CLASSIFICATION_PROMPT
+        )
+
+        return prompt.format(
             goal=goal,
             historical_summaries=txt_summaries,
             action_history=txt_actions,
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -50,6 +50,119 @@
 Action: {action}
 """
 
+ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+  The agent cannot construct or execute a correct sequence of actions to reach its goal 
+  (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+  The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+  repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+  The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+  misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+  The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+  or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+  The agent misreads or misunderstands the user's objective (goal interpretation), 
+  loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+  The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+  or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
+   If the task is successful, you can keep the error category as blank.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an unsuccessful interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Output format example for a successful interaction:
+
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+  
+Please follow this structure at every step. Keep your responses concise and clear. 
+
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
+
+
 ERROR_CLASSIFICATION_PROMPT = """
 You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
 Below are the high-level definitions of each category,