Skip to content

Commit 37d5cae

Browse files
feat: abort case when the failure is unrecoverable.
1 parent dd7f642 commit 37d5cae

File tree

4 files changed

+148
-1
lines changed

4 files changed

+148
-1
lines changed

webqa_agent/executor/test_runners.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ async def run_test(
7070
'remaining_objectives': business_objectives,
7171
'ui_tester_instance': parallel_tester,
7272
'current_test_case_index': 0,
73+
'skip_reflection': False, # Initialize skip reflection flag
7374
'language': test_config.report_config.get('language', 'zh-CN'),
7475
}
7576

@@ -519,7 +520,7 @@ def __init__(self):
519520
def _get_text(self, key: str) -> str:
520521
"""Get localized text for the current language."""
521522
return self.localized_strings.get(self.language, {}).get(key, key)
522-
523+
523524
def get_scan_tags(self, language: str) -> Dict[str, str]:
524525
"""Get scan tags with localized descriptions."""
525526
return {

webqa_agent/testers/case_gen/agents/execute_agent.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,13 @@ def extract_path(u):
347347
failed_steps.append(i + 1)
348348
logging.warning(f"Step {i+1} detected as failed based on output")
349349

350+
# Check for critical failures that should immediately stop execution
351+
if _is_critical_failure_step(tool_output, instruction_to_execute):
352+
failed_steps.append(i + 1)
353+
final_summary = f"FINAL_SUMMARY: Critical failure at step {i+1}: '{instruction_to_execute}'. Error details: {tool_output[:200]}..."
354+
logging.error(f"Critical failure detected at step {i+1}, aborting remaining steps to save time")
355+
break
356+
350357
# Check for max iterations, which indicates a failure to complete the step.
351358
if "Agent stopped due to max iterations." in tool_output:
352359
failed_steps.append(i + 1)
@@ -479,10 +486,17 @@ def extract_path(u):
479486

480487
logging.debug(f"Test case '{case_name}' final status: {status} (success indicators: {has_success}, failure indicators: {has_failure})")
481488

489+
# Classify failure type if the test case failed
490+
failure_type = None
491+
if status == "failed":
492+
failure_type = _classify_failure_type(final_summary, failed_steps)
493+
logging.info(f"Test case '{case_name}' failed with type: {failure_type}")
494+
482495
case_result = {
483496
"case_name": case_name,
484497
"final_summary": final_summary,
485498
"status": status,
499+
"failure_type": failure_type,
486500
}
487501

488502
logging.debug(f"=== Agent Worker Completed for {case_name}. ===")
@@ -491,6 +505,112 @@ def extract_path(u):
491505
return {"case_result": case_result}
492506

493507

508+
def _is_critical_failure_step(tool_output: str, step_instruction: str = "") -> bool:
509+
"""Check if a single step output indicates a critical failure that should stop execution.
510+
511+
Args:
512+
tool_output: The output from the step execution
513+
step_instruction: The instruction that was executed (for context)
514+
515+
Returns:
516+
bool: True if this is a critical failure that should stop execution
517+
"""
518+
if not tool_output:
519+
return False
520+
521+
output_lower = tool_output.lower()
522+
523+
# Critical failure patterns for immediate exit
524+
critical_step_patterns = [
525+
"element not found",
526+
"cannot find",
527+
"page crashed",
528+
"permission denied",
529+
"access denied",
530+
"network timeout",
531+
"browser error",
532+
"navigation failed",
533+
"session expired",
534+
"server error",
535+
"connection timeout",
536+
"unable to load",
537+
"page not accessible",
538+
"critical error"
539+
]
540+
541+
# Check for critical patterns
542+
for pattern in critical_step_patterns:
543+
if pattern in output_lower:
544+
logging.debug(f"Critical failure detected in step: pattern '{pattern}' found")
545+
return True
546+
547+
return False
548+
549+
550+
def _classify_failure_type(final_summary: str, failed_steps: list = None) -> str:
551+
"""Classify failure type as 'critical' or 'recoverable'.
552+
553+
Args:
554+
final_summary: The final summary text containing failure information
555+
failed_steps: List of failed step numbers
556+
557+
Returns:
558+
str: 'critical' for unrecoverable failures, 'recoverable' for failures that might be fixed via replan
559+
"""
560+
if not final_summary:
561+
return "recoverable"
562+
563+
summary_lower = final_summary.lower()
564+
565+
# Check for early critical failure exit (from immediate step detection)
566+
if "critical failure at step" in summary_lower:
567+
logging.debug("Early critical failure exit detected - classified as critical")
568+
return "critical"
569+
570+
# Critical failure patterns - these indicate unrecoverable issues
571+
critical_patterns = [
572+
"element not found",
573+
"cannot find",
574+
"page crashed",
575+
"permission denied",
576+
"access denied",
577+
"network timeout",
578+
"max iterations",
579+
"exception:",
580+
"cannot proceed",
581+
"preamble action",
582+
"raised exception",
583+
"agent stopped due to max iterations",
584+
"element not available",
585+
"page not accessible",
586+
"browser error",
587+
"navigation failed",
588+
"session expired",
589+
"server error",
590+
"connection timeout",
591+
"unable to load",
592+
"critical error"
593+
]
594+
595+
# Check if any critical pattern is present
596+
for pattern in critical_patterns:
597+
if pattern in summary_lower:
598+
logging.debug(f"Critical failure detected: pattern '{pattern}' found in summary")
599+
return "critical"
600+
601+
# Additional heuristics for critical failures
602+
# If too many steps failed, it might indicate a fundamental issue
603+
if failed_steps and len(failed_steps) > 0:
604+
total_failed = len(failed_steps)
605+
if total_failed >= 3: # If 3 or more steps failed, likely critical
606+
logging.debug(f"Critical failure detected: {total_failed} steps failed")
607+
return "critical"
608+
609+
# Default to recoverable for validation failures, partial failures, etc.
610+
logging.debug("Failure classified as recoverable")
611+
return "recoverable"
612+
613+
494614
def _is_navigation_instruction(instruction: str) -> bool:
495615
"""Determine if the instruction is a navigation instruction.
496616

webqa_agent/testers/case_gen/graph.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,19 @@ async def reflect_and_replan(state: MainGraphState) -> dict:
230230
)
231231
update = {"current_test_case_index": new_index}
232232

233+
# Check if we should skip reflection due to critical failure
234+
if state.get("skip_reflection", False):
235+
logging.info("Skipping reflection due to critical failure. Moving directly to next test case.")
236+
update["skip_reflection"] = False # Reset the flag
237+
update["reflection_history"] = [
238+
{
239+
"decision": "CONTINUE",
240+
"reasoning": "Critical failure detected in previous test case. Skipping reflection and continuing with next test case to avoid wasting time on unrecoverable errors.",
241+
"new_plan": [],
242+
}
243+
]
244+
return update
245+
233246
# FUSE MECHANISM: Check if the replan limit has been reached.
234247
MAX_REPLANS = 2
235248
if state.get("replan_count", 0) >= MAX_REPLANS:
@@ -425,6 +438,17 @@ async def execute_single_case(state: MainGraphState) -> dict:
425438

426439
ui_tester_instance.finish_case(final_status, final_summary)
427440

441+
# Check if this is a critical failure that should skip reflection
442+
if case_result and case_result.get("status") == "failed":
443+
failure_type = case_result.get("failure_type")
444+
case_name = case_result.get("case_name", "Unknown")
445+
446+
if failure_type == "critical":
447+
logging.warning(f"Critical failure detected in test case '{case_name}'. Skipping reflection and moving to next case.")
448+
return {"completed_cases": [case_result], "skip_reflection": True}
449+
else:
450+
logging.info(f"Recoverable failure in test case '{case_name}'. Will proceed with reflection for potential replan.")
451+
428452
# Return the single result in a list to be appended by the graph state
429453
return {"completed_cases": [case_result] if case_result else []}
430454

webqa_agent/testers/case_gen/state/schemas.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@ class MainGraphState(TypedDict):
2525
remaining_objectives: Optional[str]
2626
ui_tester_instance: Any
2727
final_report: Optional[dict]
28+
# For critical failure handling
29+
skip_reflection: bool

0 commit comments

Comments
 (0)