feat: abort case when the failure is unrecoverable.

seancoding-day · seancoding-day · commit 37d5cae0407f · 2025-09-08T16:06:32.000+08:00
diff --git a/webqa_agent/executor/test_runners.py b/webqa_agent/executor/test_runners.py
@@ -70,6 +70,7 @@ async def run_test(
                     'remaining_objectives': business_objectives,
                     'ui_tester_instance': parallel_tester,
                     'current_test_case_index': 0,
+                    'skip_reflection': False,  # Initialize skip reflection flag
                     'language': test_config.report_config.get('language', 'zh-CN'),
                 }
 
@@ -519,7 +520,7 @@ def __init__(self):
     def _get_text(self, key: str) -> str:
         """Get localized text for the current language."""
         return self.localized_strings.get(self.language, {}).get(key, key)
-    
+
     def get_scan_tags(self, language: str) -> Dict[str, str]:
         """Get scan tags with localized descriptions."""
         return {
diff --git a/webqa_agent/testers/case_gen/agents/execute_agent.py b/webqa_agent/testers/case_gen/agents/execute_agent.py
@@ -347,6 +347,13 @@ def extract_path(u):
                 failed_steps.append(i + 1)
                 logging.warning(f"Step {i+1} detected as failed based on output")
 
+            # Check for critical failures that should immediately stop execution
+            if _is_critical_failure_step(tool_output, instruction_to_execute):
+                failed_steps.append(i + 1)
+                final_summary = f"FINAL_SUMMARY: Critical failure at step {i+1}: '{instruction_to_execute}'. Error details: {tool_output[:200]}..."
+                logging.error(f"Critical failure detected at step {i+1}, aborting remaining steps to save time")
+                break
+
             # Check for max iterations, which indicates a failure to complete the step.
             if "Agent stopped due to max iterations." in tool_output:
                 failed_steps.append(i + 1)
@@ -479,10 +486,17 @@ def extract_path(u):
 
     logging.debug(f"Test case '{case_name}' final status: {status} (success indicators: {has_success}, failure indicators: {has_failure})")
 
+    # Classify failure type if the test case failed
+    failure_type = None
+    if status == "failed":
+        failure_type = _classify_failure_type(final_summary, failed_steps)
+        logging.info(f"Test case '{case_name}' failed with type: {failure_type}")
+
     case_result = {
         "case_name": case_name,
         "final_summary": final_summary,
         "status": status,
+        "failure_type": failure_type,
     }
 
     logging.debug(f"=== Agent Worker Completed for {case_name}. ===")
@@ -491,6 +505,112 @@ def extract_path(u):
     return {"case_result": case_result}
 
 
+def _is_critical_failure_step(tool_output: str, step_instruction: str = "") -> bool:
+    """Report whether one step's output contains a known critical-failure phrase.
+
+    The check is a case-insensitive substring search of the output against a
+    fixed list of phrases. The first phrase that matches (in list order) is
+    logged at debug level.
+
+    Args:
+        tool_output: Text produced by executing the step. Empty or None
+            output is never treated as critical.
+        step_instruction: The instruction that was executed. Accepted for
+            context only; it does not influence the result.
+
+    Returns:
+        bool: True when any critical phrase occurs in the output, else False.
+    """
+    # Phrases that trigger an immediate exit. They are written in lowercase
+    # because the output is lowercased before matching.
+    fatal_phrases = (
+        "element not found",
+        "cannot find",
+        "page crashed",
+        "permission denied",
+        "access denied",
+        "network timeout",
+        "browser error",
+        "navigation failed",
+        "session expired",
+        "server error",
+        "connection timeout",
+        "unable to load",
+        "page not accessible",
+        "critical error",
+    )
+
+    # Falsy output (empty string / None) collapses to "" and matches nothing.
+    haystack = tool_output.lower() if tool_output else ""
+    matched = next((phrase for phrase in fatal_phrases if phrase in haystack), None)
+    if matched is None:
+        return False
+
+    logging.debug(f"Critical failure detected in step: pattern '{matched}' found")
+    return True
+
+
+def _classify_failure_type(final_summary: str, failed_steps: list = None) -> str:
+    """Classify a failed test case as 'critical' or 'recoverable'.
+
+    The decision is made in this order:
+
+    1. An empty or None summary is always 'recoverable' (the failed-step
+       count is not consulted in that case).
+    2. A summary containing "critical failure at step" is 'critical'.
+    3. A summary containing any phrase from the critical pattern list
+       (case-insensitive substring match) is 'critical'.
+    4. Three or more distinct failed steps is 'critical'.
+    5. Anything else is 'recoverable'.
+
+    Args:
+        final_summary: The final summary text containing failure information.
+        failed_steps: Step numbers that failed; may be None or empty. The same
+            step number may appear more than once and is counted only once.
+
+    Returns:
+        str: 'critical' for unrecoverable failures, 'recoverable' for failures
+        that might be fixed via replan.
+    """
+    if not final_summary:
+        return "recoverable"
+
+    summary_lower = final_summary.lower()
+
+    # Marker phrase for an early critical-failure exit; kept as its own check
+    # so it gets a distinct log message.
+    if "critical failure at step" in summary_lower:
+        logging.debug("Early critical failure exit detected - classified as critical")
+        return "critical"
+
+    # Phrases that indicate an unrecoverable issue. "max iterations" also
+    # covers the longer "agent stopped due to max iterations" wording, so the
+    # longer phrase needs no entry of its own.
+    critical_patterns = (
+        "element not found",
+        "cannot find",
+        "page crashed",
+        "permission denied",
+        "access denied",
+        "network timeout",
+        "max iterations",
+        "exception:",
+        "cannot proceed",
+        "preamble action",
+        "raised exception",
+        "element not available",
+        "page not accessible",
+        "browser error",
+        "navigation failed",
+        "session expired",
+        "server error",
+        "connection timeout",
+        "unable to load",
+        "critical error",
+    )
+
+    matched = next((p for p in critical_patterns if p in summary_lower), None)
+    if matched is not None:
+        logging.debug(f"Critical failure detected: pattern '{matched}' found in summary")
+        return "critical"
+
+    # Heuristic: many failed steps suggest a fundamental problem. A step can be
+    # recorded more than once when several detectors flag it, so count distinct
+    # step numbers rather than raw list entries.
+    min_failed_steps_for_critical = 3
+    distinct_failed = len(set(failed_steps)) if failed_steps else 0
+    if distinct_failed >= min_failed_steps_for_critical:
+        logging.debug(f"Critical failure detected: {distinct_failed} steps failed")
+        return "critical"
+
+    # Default to recoverable for validation failures, partial failures, etc.
+    logging.debug("Failure classified as recoverable")
+    return "recoverable"
+
+
 def _is_navigation_instruction(instruction: str) -> bool:
     """Determine if the instruction is a navigation instruction.
 
diff --git a/webqa_agent/testers/case_gen/graph.py b/webqa_agent/testers/case_gen/graph.py
@@ -230,6 +230,19 @@ async def reflect_and_replan(state: MainGraphState) -> dict:
     )
     update = {"current_test_case_index": new_index}
 
+    # Check if we should skip reflection due to critical failure
+    if state.get("skip_reflection", False):
+        logging.info("Skipping reflection due to critical failure. Moving directly to next test case.")
+        update["skip_reflection"] = False  # Reset the flag
+        update["reflection_history"] = [
+            {
+                "decision": "CONTINUE",
+                "reasoning": "Critical failure detected in previous test case. Skipping reflection and continuing with next test case to avoid wasting time on unrecoverable errors.",
+                "new_plan": [],
+            }
+        ]
+        return update
+
     # FUSE MECHANISM: Check if the replan limit has been reached.
     MAX_REPLANS = 2
     if state.get("replan_count", 0) >= MAX_REPLANS:
@@ -425,6 +438,17 @@ async def execute_single_case(state: MainGraphState) -> dict:
 
         ui_tester_instance.finish_case(final_status, final_summary)
 
+        # Check if this is a critical failure that should skip reflection
+        if case_result and case_result.get("status") == "failed":
+            failure_type = case_result.get("failure_type")
+            case_name = case_result.get("case_name", "Unknown")
+            
+            if failure_type == "critical":
+                logging.warning(f"Critical failure detected in test case '{case_name}'. Skipping reflection and moving to next case.")
+                return {"completed_cases": [case_result], "skip_reflection": True}
+            else:
+                logging.info(f"Recoverable failure in test case '{case_name}'. Will proceed with reflection for potential replan.")
+
         # Return the single result in a list to be appended by the graph state
         return {"completed_cases": [case_result] if case_result else []}
 
diff --git a/webqa_agent/testers/case_gen/state/schemas.py b/webqa_agent/testers/case_gen/state/schemas.py
@@ -25,3 +25,5 @@ class MainGraphState(TypedDict):
     remaining_objectives: Optional[str]
     ui_tester_instance: Any
     final_report: Optional[dict]
+    # For critical failure handling
+    skip_reflection: bool