Commit 071076c

feat: checkpoint2
1 parent df9a857 commit 071076c

4 files changed: +563 -236 lines changed

webqa_agent/testers/case_gen/agents/execute_agent.py

Lines changed: 47 additions & 6 deletions
@@ -279,11 +279,12 @@ async def generate_dynamic_steps_with_llm(
 reason = result.get("reason", "No reason provided")
 steps = result.get("steps", [])

-# Extract and validate analysis fields (QAG format)
+# Extract and validate analysis fields (Enhanced QAG format)
 analysis = result.get("analysis", {})
 q1_can_complete_alone = analysis.get("q1_can_complete_alone", False) if isinstance(analysis, dict) else False
 q2_different_aspects = analysis.get("q2_different_aspects", False) if isinstance(analysis, dict) else False
 q3_remaining_redundant = analysis.get("q3_remaining_redundant", False) if isinstance(analysis, dict) else False
+q4_abstraction_gap = analysis.get("q4_abstraction_gap", False) if isinstance(analysis, dict) else False

 # Validate strategy value
 if strategy not in ["insert", "replace"]:
@@ -303,6 +304,10 @@
     logging.debug(f"Invalid q3_remaining_redundant {q3_remaining_redundant}, defaulting to False")
     q3_remaining_redundant = False

+if not isinstance(q4_abstraction_gap, bool):
+    logging.debug(f"Invalid q4_abstraction_gap {q4_abstraction_gap}, defaulting to False")
+    q4_abstraction_gap = False
+
 # Validate and limit step count
 valid_steps = []
 if isinstance(steps, list):
@@ -315,7 +320,7 @@

 logging.debug(f"Strategy reason: {reason}")
 if analysis:
-    logging.debug(f"QAG Analysis: q1_can_complete_alone={q1_can_complete_alone}, q2_different_aspects={q2_different_aspects}, q3_remaining_redundant={q3_remaining_redundant}")
+    logging.debug(f"Enhanced QAG Analysis: q1_can_complete_alone={q1_can_complete_alone}, q2_different_aspects={q2_different_aspects}, q3_remaining_redundant={q3_remaining_redundant}, q4_abstraction_gap={q4_abstraction_gap}")

 # Return enhanced result with QAG analysis
 result_data = {
@@ -324,12 +329,13 @@
     "steps": valid_steps
 }

-# Include QAG analysis if provided
+# Include Enhanced QAG analysis if provided
 if analysis:
     result_data["analysis"] = {
         "q1_can_complete_alone": q1_can_complete_alone,
         "q2_different_aspects": q2_different_aspects,
-        "q3_remaining_redundant": q3_remaining_redundant
+        "q3_remaining_redundant": q3_remaining_redundant,
+        "q4_abstraction_gap": q4_abstraction_gap
     }

 return result_data
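For orientation, a response from the LLM in the Enhanced QAG format might look like the following; the field names match the keys read above, while the concrete values, step text, and reason are invented for illustration.

# Hypothetical LLM response in the Enhanced QAG format (illustrative values only).
result = {
    "strategy": "insert",  # must be "insert" or "replace"
    "reason": "The newly revealed filter panel is not exercised by any remaining step.",
    "steps": [{"action": "Open the filter panel and apply one filter"}],
    "analysis": {
        "q1_can_complete_alone": False,
        "q2_different_aspects": True,
        "q3_remaining_redundant": False,
        "q4_abstraction_gap": True,  # new field introduced by this commit
    },
}
# After validation, result_data echoes strategy/reason/steps and, because
# "analysis" is present, carries all four booleans under result_data["analysis"].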
@@ -572,6 +578,7 @@ def extract_path(u):
 failed_steps = []  # Track failed steps for summary generation
 case_modified = False  # Track if case was modified with dynamic steps
 dynamic_generation_count = 0  # Track how many times dynamic generation occurred
+dom_diff_cache = []

 for i, step in enumerate(case.get("steps", [])):
     instruction_to_execute = step.get("action") or step.get("verify")
@@ -692,10 +699,16 @@
 logging.error(f"Step {i+1} failed due to max iterations.")
 break

+# Check for objective achievement signal
+is_achieved, achievement_reason = _is_objective_achieved(tool_output)
+if is_achieved:
+    logging.info(f"Test objective achieved at step {i+1}: {achievement_reason}")
+    final_summary = f"FINAL_SUMMARY: Test case completed successfully with early termination at step {i+1}. {achievement_reason}"
+    break
+
 logging.debug(f"Step {i+1} completed {'successfully' if (i+1) not in failed_steps else 'with issues'}.")

 # --- Dynamic Step Generation ---
-# Check if dynamic step generation is enabled and current step succeeded
 if step_type == "Action":
     # Get dynamic step generation config from state
     dynamic_config = state.get("dynamic_step_generation", {
@@ -712,7 +725,7 @@
 # Extract DOM diff from tool output
 dom_diff = extract_dom_diff_from_output(result['intermediate_steps'][0][1])

-if dom_diff and len(dom_diff) >= min_elements_threshold:
+if dom_diff and len(dom_diff) >= min_elements_threshold and dom_diff not in dom_diff_cache:
     logging.info(f"Detected {len(dom_diff)} new elements, starting dynamic test step generation")

     try:
@@ -817,6 +830,8 @@ def is_similar_step(step1: dict, step2: dict) -> bool:
 logging.debug(f"Detected {len(dom_diff)} new elements, but below threshold {min_elements_threshold}, skipping dynamic step generation")
 else:
     logging.debug("No DOM changes detected, skipping dynamic step generation")
+dom_diff_cache.append(dom_diff)
+
 else:
     logging.debug("Dynamic step generation not enabled")
 # --- Dynamic Step Generation End ---
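Read together, the hunks above gate dynamic generation on diff size and novelty. A minimal sketch of that gate, assuming a dynamic_step_generation config dict with a min_elements_threshold key (the helper name and default value here are invented, not part of the repository):

# Sketch only: mirrors the condition added in this commit, not a helper from the repo.
def should_generate_dynamic_steps(dom_diff, dynamic_config, dom_diff_cache) -> bool:
    min_elements_threshold = dynamic_config.get("min_elements_threshold", 3)  # assumed default
    if not dom_diff:
        return False  # no DOM changes detected
    if len(dom_diff) < min_elements_threshold:
        return False  # too few new elements to be worth generating steps for
    if dom_diff in dom_diff_cache:
        return False  # an identical diff was already handled earlier in this case
    return True

Appending the processed diff to dom_diff_cache is what keeps identical DOM changes from triggering generation more than once within the same case.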
@@ -968,6 +983,32 @@ def is_similar_step(step1: dict, step2: dict) -> bool:
 return result


+def _is_objective_achieved(tool_output: str) -> tuple[bool, str]:
+    """Check if the agent has signaled that the test objective is achieved.
+
+    Args:
+        tool_output: The output from the step execution
+
+    Returns:
+        tuple: (is_achieved: bool, reason: str)
+    """
+    if not tool_output or "OBJECTIVE_ACHIEVED:" not in tool_output:
+        return False, ""
+
+    try:
+        # Extract the reason after the signal
+        parts = tool_output.split("OBJECTIVE_ACHIEVED:")
+        if len(parts) > 1:
+            reason = parts[1].split("\n")[0].strip()
+            # Only return True if there's actual content after the signal
+            if reason:
+                return True, reason
+    except Exception as e:
+        logging.debug(f"Error parsing objective achievement signal: {e}")
+
+    return False, ""
+
+
 def _is_critical_failure_step(tool_output: str, step_instruction: str = "") -> bool:
     """Check if a single step output indicates a critical failure that should stop execution.
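As a usage sketch against the helper above (the step output text is invented), only a signal followed by content on the same line flips the result to True:

# Hypothetical step output containing the early-completion signal.
tool_output = (
    "Assertion passed: result count is visible.\n"
    'OBJECTIVE_ACHIEVED: Test objective "Verify search returns results" completed at step 3. '
    "Remaining 2 steps are redundant. Reason: result list and count were both validated.\n"
)

is_achieved, reason = _is_objective_achieved(tool_output)
# is_achieved -> True; reason is everything after the marker up to the end of that line.

# A bare marker with nothing after the colon is ignored:
assert _is_objective_achieved("OBJECTIVE_ACHIEVED:") == (False, "")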

webqa_agent/testers/case_gen/prompts/agent_prompts.py

Lines changed: 24 additions & 12 deletions
@@ -7,7 +7,6 @@ def get_execute_system_prompt(case: dict) -> str:
 # Core fields (original)
 objective = case.get("objective", "Not specified")
 success_criteria = case.get("success_criteria", ["Not specified"])
-steps_list = case.get("steps", [])

 # Enhanced fields (new)
 priority = case.get("priority", "Medium")
@@ -16,14 +15,6 @@ def get_execute_system_prompt(case: dict) -> str:
 domain_specific_rules = case.get("domain_specific_rules", "")
 test_data_requirements = case.get("test_data_requirements", "")

-# Format step information
-formatted_steps = []
-for i, step in enumerate(steps_list):
-    if "action" in step:
-        formatted_steps.append(f"{i+1}. Action: {step['action']}")
-    elif "verify" in step:
-        formatted_steps.append(f"{i+1}. Assert: {step['verify']}")
-
 system_prompt = f"""You are an intelligent UI test execution agent specialized in web application testing. Your role is to execute individual test cases by performing UI interactions and validations in a systematic, reliable manner following established QA best practices.

 ## Core Mission
@@ -139,7 +130,28 @@ def get_execute_system_prompt(case: dict) -> str:
 - UI state errors: Navigate back to expected state
 4. **Resume test plan** only after successful error resolution

-### 3. Test Plan Adherence (THIRD PRIORITY)
+### 3. Objective Achievement Detection (THIRD PRIORITY)
+**Critical Rule**: After completing each step, evaluate whether the test objective has been fully achieved.
+If the objective is complete and remaining steps would be redundant, signal early completion.
+
+**Objective Achievement Criteria**:
+- All success criteria have been validated through executed actions
+- Core functionality has been thoroughly tested and verified
+- Remaining steps would provide no additional value or coverage
+- The test objective is comprehensively fulfilled based on actual results
+
+**Early Completion Signal Format**:
+When you determine the test objective is achieved, output this exact signal:
+`OBJECTIVE_ACHIEVED: Test objective "[objective]" completed at step [X]. Remaining [Y] steps are redundant. Reason: [detailed explanation of why objective is complete and remaining steps unnecessary].`
+
+**Decision Guidelines**:
+- **Be Conservative**: Only signal when absolutely certain objective is achieved
+- **Evaluate Coverage**: Consider if remaining steps test unique aspects not yet covered
+- **Base on Results**: Evaluate based on actual execution results, not assumptions
+- **Dynamic Context**: This is especially relevant after dynamic steps that may have covered the original test intent
+- **Unique Value Assessment**: Focus on whether remaining steps add genuine testing value
+
+### 4. Test Plan Adherence (FOURTH PRIORITY)
 **Execution Strategy**:
 - Execute test steps in the defined sequence
 - Use appropriate tools based on step type:
@@ -148,8 +160,8 @@ def get_execute_system_prompt(case: dict) -> str:
 - Maintain clear action descriptions for test documentation
 - Track progress through the test plan systematically

-### 4. Test Objective Achievement (FOURTH PRIORITY)
-**Goal-Oriented Execution**:
+### 5. Adaptive Goal Execution (FIFTH PRIORITY)
+**Goal-Oriented Adaptation**:
 - Keep the test objective as the ultimate success criterion
 - If the standard test steps cannot achieve the objective due to UI changes, adapt the approach while maintaining test integrity
 - Document any deviations from the planned approach with clear justification