Commit df9a857

feat: checkpoint
1 parent 09d9d7c commit df9a857

2 files changed: +284 -546 lines changed

webqa_agent/testers/case_gen/agents/execute_agent.py

Lines changed: 109 additions & 86 deletions
@@ -21,6 +21,7 @@
 from webqa_agent.testers.case_gen.utils.message_converter import convert_intermediate_steps_to_messages
 from webqa_agent.utils.log_icon import icon

+LONG_STEPS = 10

 # ============================================================================
 # Dynamic Step Generation Helper Functions
@@ -126,19 +127,23 @@ async def generate_dynamic_steps_with_llm(
     max_steps: int,
     llm: any,
     current_case: dict = None,
-    screenshot: str = None
+    screenshot: str = None,
+    tool_output: str = None,
+    step_success: bool = True
 ) -> dict:
     """Generate dynamic test steps using LLM with full test case context and visual information

     Args:
         dom_diff: New DOM elements detected
-        last_action: The action that triggered the new elements (successfully executed)
+        last_action: The action that triggered the new elements
         test_objective: Overall test objective
         executed_steps: Number of steps executed so far
         max_steps: Maximum number of steps to generate
         llm: LLM instance for generation
         current_case: Complete test case containing all steps for context
         screenshot: Base64 screenshot of current page state for visual context
+        tool_output: Output from the tool execution for context (optional)
+        step_success: Whether the previous step executed successfully (default: True)

     Returns:
         Dict containing strategy ("insert" or "replace") and generated test steps
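Both new parameters are backward compatible: `tool_output` defaults to `None` and `step_success` to `True`, so untouched call sites behave as before. A minimal sketch of a call exercising the new failure path (variable names outside the signature are illustrative):

```python
# Hypothetical call site; dom_diff, llm and case are assumed to be in scope.
dynamic_result = await generate_dynamic_steps_with_llm(
    dom_diff=dom_diff,
    last_action="Click the 'Submit' button",
    test_objective=case.get("objective", ""),
    executed_steps=3,
    max_steps=5,
    llm=llm,
    current_case=case,
    screenshot=None,  # text-only prompt path
    tool_output="TimeoutError: locator '#submit' not found",  # surfaces under '## Execution Details'
    step_success=False,  # selects the recovery-oriented prompt branch
)
```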
@@ -179,78 +184,59 @@ async def generate_dynamic_steps_with_llm(
 {json.dumps(remaining_steps, ensure_ascii=False, indent=2) if remaining_steps else "None"}
 """

-    # Build multi-modal user prompt with success context and insertion strategy
+    # Build multi-modal user prompt with dynamic status context
     visual_context_section = ""
     if screenshot:
+        execution_context = "AFTER the execution of the last action" if step_success else "AFTER the attempted execution of the last action"
         visual_context_section = f"""
 ## Current Page Visual Context
-The attached screenshot shows the current state of the page AFTER the successful execution of the last action.
+The attached screenshot shows the current state of the page {execution_context}.
 Use this visual information along with the DOM diff to understand the complete UI state.
+"""
+
+    # Build context based on actual execution result
+    if step_success:
+        action_status = f"✅ SUCCESSFULLY EXECUTED: \"{last_action}\""
+        status_context = "The above action has been completed successfully. Do NOT re-plan or duplicate this action."
+        execution_description = "After the successful action execution"
+    else:
+        action_status = f"⚠️ FAILED/PARTIAL EXECUTION: \"{last_action}\""
+        status_context = "The above action failed or partially succeeded. Consider recovery steps or alternative approaches."
+        execution_description = "After the failed/partial action execution"
+
+    # Include tool output for better context
+    tool_output_section = ""
+    if tool_output:
+        # Truncate if too long to prevent prompt overflow
+        tool_output_section = f"""
+
+## Execution Details
+{tool_output}
 """

     user_prompt = f"""
 ## Previous Action Status
-✅ SUCCESSFULLY EXECUTED: "{last_action}"
-The above action has been completed successfully. Do NOT re-plan or duplicate this action.
+{action_status}
+{status_context}{tool_output_section}

 ## New UI Elements Detected
-After the successful action execution, {len(new_elements)} new UI elements appeared:
+{execution_description}, {len(new_elements)} new UI elements appeared:
 {json.dumps(new_elements, ensure_ascii=False, indent=2)}

 {visual_context_section}

 {test_case_context}

-## Structured Analysis Requirements
+## Analysis Context
 Max steps to generate: {max_steps}
 Test Objective: "{test_objective}"

-### Step 1: Calculate Objective Completion Score
-Assess what percentage of the remaining test objective can be achieved using ONLY these new elements:
-- **100%**: New elements fully complete ALL remaining objectives independently
-- **75-99%**: Elements achieve most objectives with minor gaps
-- **25-74%**: Significant contribution but requires original steps
-- **0-24%**: Minimal or supplementary value only
-
-### Step 2: Apply Quantitative Decision Framework
-**Primary Decision Rules:**
-- Score ≥ 75% AND remaining steps don't test different aspects → "replace"
-- Score < 75% OR remaining steps test different aspects → "insert"
-
-### Step 3: Binary Validation Checklist
-Answer these YES/NO questions:
-□ Can new elements complete the test objective independently?
-□ Do remaining steps become unnecessary after using new elements?
-□ Do new elements test the SAME aspects as remaining steps?
-□ Is there a more efficient path through new elements?
-
-**Scoring**: 3+ YES → "replace", ≤2 YES → "insert"
-
-### Step 4: Generate Structured Response
-Return your analysis in this EXACT format:
-```json
-{{
-  "analysis": {{
-    "objective_completion_score": [0-100],
-    "can_complete_objective_alone": [true/false],
-    "remaining_steps_redundant": [true/false],
-    "confidence_level": ["HIGH"|"MEDIUM"|"LOW"]
-  }},
-  "strategy": "insert" or "replace",
-  "reason": "Based on [X]% completion score: [detailed explanation of decision logic]",
-  "steps": [
-    {{"action": "specific action description"}},
-    {{"verify": "specific verification description"}}
-  ]
-}}
-```
-
-**For irrelevant elements**: {{"analysis": {{"objective_completion_score": 0, "can_complete_objective_alone": false, "remaining_steps_redundant": false, "confidence_level": "HIGH"}}, "strategy": "insert", "reason": "Elements provide no functional value", "steps": []}}
+Please analyze these new UI elements using the QAG methodology and generate appropriate test steps if needed.
 """

     logging.debug(f"Requesting LLM to generate dynamic steps for {len(new_elements)} new elements")

-    # Call LLM with multi-modal context if screenshot available
+    # Call LLM with proper message structure
     if screenshot:
         # Multi-modal call with screenshot
         messages = [
@@ -266,10 +252,14 @@ async def generate_dynamic_steps_with_llm(
                 ]
             }
         ]
-        response = await llm.ainvoke(messages)
     else:
-        # Text-only call
-        response = await llm.ainvoke(system_prompt + "\\n" + user_prompt)
+        # Text-only call with proper message structure
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+
+    response = await llm.ainvoke(messages)

     # Parse response
     if hasattr(response, 'content'):
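The text-only branch previously collapsed both prompts into one string, losing the system/user role separation; now both branches build a `messages` list and the model is invoked exactly once after the branch. The multi-modal payload itself is elided between hunks; a sketch of a typical OpenAI-style shape, assuming that is what the model wrapper expects:

```python
# Illustrative multi-modal structure (assumed OpenAI-style content parts,
# not necessarily the exact elided code from this commit):
messages = [
    {"role": "system", "content": system_prompt},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot}"}},
        ],
    },
]
response = await llm.ainvoke(messages)  # single invocation point for both paths
```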
@@ -289,27 +279,29 @@ async def generate_dynamic_steps_with_llm(
         reason = result.get("reason", "No reason provided")
         steps = result.get("steps", [])

-        # Extract and validate analysis fields (new format)
+        # Extract and validate analysis fields (QAG format)
         analysis = result.get("analysis", {})
-        completion_score = analysis.get("objective_completion_score", 0) if isinstance(analysis, dict) else 0
-        can_complete_alone = analysis.get("can_complete_objective_alone", False) if isinstance(analysis, dict) else False
-        steps_redundant = analysis.get("remaining_steps_redundant", False) if isinstance(analysis, dict) else False
-        confidence = analysis.get("confidence_level", "MEDIUM") if isinstance(analysis, dict) else "MEDIUM"
+        q1_can_complete_alone = analysis.get("q1_can_complete_alone", False) if isinstance(analysis, dict) else False
+        q2_different_aspects = analysis.get("q2_different_aspects", False) if isinstance(analysis, dict) else False
+        q3_remaining_redundant = analysis.get("q3_remaining_redundant", False) if isinstance(analysis, dict) else False

         # Validate strategy value
         if strategy not in ["insert", "replace"]:
             logging.warning(f"Invalid strategy '{strategy}', defaulting to 'insert'")
             strategy = "insert"

-        # Validate completion score if provided
-        if not isinstance(completion_score, (int, float)) or not (0 <= completion_score <= 100):
-            logging.debug(f"Invalid completion score {completion_score}, defaulting to 0")
-            completion_score = 0
+        # Validate QAG analysis fields
+        if not isinstance(q1_can_complete_alone, bool):
+            logging.debug(f"Invalid q1_can_complete_alone {q1_can_complete_alone}, defaulting to False")
+            q1_can_complete_alone = False

-        # Validate confidence level
-        if confidence not in ["HIGH", "MEDIUM", "LOW"]:
-            logging.debug(f"Invalid confidence level {confidence}, defaulting to MEDIUM")
-            confidence = "MEDIUM"
+        if not isinstance(q2_different_aspects, bool):
+            logging.debug(f"Invalid q2_different_aspects {q2_different_aspects}, defaulting to False")
+            q2_different_aspects = False
+
+        if not isinstance(q3_remaining_redundant, bool):
+            logging.debug(f"Invalid q3_remaining_redundant {q3_remaining_redundant}, defaulting to False")
+            q3_remaining_redundant = False

         # Validate and limit step count
         valid_steps = []
@@ -318,30 +310,26 @@ async def generate_dynamic_steps_with_llm(
             if isinstance(step, dict) and ("action" in step or "verify" in step):
                 valid_steps.append(step)

-        # Enhanced logging with analysis data
-        if completion_score > 0:
-            logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' (score: {completion_score}%, confidence: {confidence}) from {len(new_elements)} new elements")
-        else:
-            logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' from {len(new_elements)} new elements")
+        # Enhanced logging with QAG analysis data
+        logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' from {len(new_elements)} new elements")

         logging.debug(f"Strategy reason: {reason}")
         if analysis:
-            logging.debug(f"Analysis: completion_score={completion_score}%, can_complete_alone={can_complete_alone}, steps_redundant={steps_redundant}, confidence={confidence}")
+            logging.debug(f"QAG Analysis: q1_can_complete_alone={q1_can_complete_alone}, q2_different_aspects={q2_different_aspects}, q3_remaining_redundant={q3_remaining_redundant}")

-        # Return enhanced result with analysis
+        # Return enhanced result with QAG analysis
         result_data = {
             "strategy": strategy,
             "reason": reason,
             "steps": valid_steps
         }

-        # Include analysis if provided (backward compatibility)
+        # Include QAG analysis if provided
         if analysis:
             result_data["analysis"] = {
-                "objective_completion_score": completion_score,
-                "can_complete_objective_alone": can_complete_alone,
-                "remaining_steps_redundant": steps_redundant,
-                "confidence_level": confidence
+                "q1_can_complete_alone": q1_can_complete_alone,
+                "q2_different_aspects": q2_different_aspects,
+                "q3_remaining_redundant": q3_remaining_redundant
             }

         return result_data
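The `.get()` calls above imply the model is now expected to answer three boolean QAG questions instead of returning a 0-100 completion score plus a confidence level. An illustrative payload that passes every validation check (all values invented):

```python
# Example response shape accepted by the QAG parser; values are illustrative.
example_response = {
    "analysis": {
        "q1_can_complete_alone": False,   # new elements cannot finish the objective alone
        "q2_different_aspects": True,     # remaining steps cover different aspects
        "q3_remaining_redundant": False,  # remaining steps are still needed
    },
    "strategy": "insert",                 # must be "insert" or "replace"
    "reason": "New elements add coverage but do not make the remaining steps redundant",
    "steps": [
        {"action": "Open the newly revealed filter panel"},
        {"verify": "The filter panel lists the expected options"},
    ],
}
```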
@@ -583,6 +571,7 @@ def extract_path(u):
     total_steps = len(case.get("steps", []))
     failed_steps = []  # Track failed steps for summary generation
     case_modified = False  # Track if case was modified with dynamic steps
+    dynamic_generation_count = 0  # Track how many times dynamic generation occurred

     for i, step in enumerate(case.get("steps", [])):
         instruction_to_execute = step.get("action") or step.get("verify")
@@ -707,7 +696,7 @@ def extract_path(u):

         # --- Dynamic Step Generation ---
         # Check if dynamic step generation is enabled and current step succeeded
-        if (i+1) not in failed_steps and step_type == "Action" and "[success]" in result['intermediate_steps'][0][1].lower():
+        if step_type == "Action":
             # Get dynamic step generation config from state
             dynamic_config = state.get("dynamic_step_generation", {
                 "enabled": False,
@@ -731,16 +720,28 @@ def extract_path(u):
                 logging.debug("Capturing screenshot for dynamic step generation context")
                 screenshot = await ui_tester_instance._actions.b64_page_screenshot()

+                # Enhance objective with generation context for smarter LLM decision-making
+                enhanced_objective = case.get("objective", "")
+                if dynamic_generation_count > 0:
+                    enhanced_objective += f" (Context: Already generated {dynamic_generation_count} rounds of dynamic steps, be selective about additional generation)"
+                if i+1 > LONG_STEPS:  # Long test indicator
+                    enhanced_objective += f" (Context: Test already has {i+1} steps, consider if more steps add meaningful value)"
+
+                # Determine if current step succeeded based on failed_steps list
+                step_success = (i + 1) not in failed_steps
+
                 # Generate dynamic test steps with complete context and visual information
                 dynamic_result = await generate_dynamic_steps_with_llm(
                     dom_diff=dom_diff,
                     last_action=instruction_to_execute,
-                    test_objective=case.get("objective", ""),
+                    test_objective=enhanced_objective,
                     executed_steps=i+1,
                     max_steps=max_dynamic_steps,
                     llm=llm,
                     current_case=case,
-                    screenshot=screenshot
+                    screenshot=screenshot,
+                    tool_output=tool_output,
+                    step_success=step_success
                 )

                 # Handle dynamic steps based on LLM strategy decision
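The objective enhancement is plain string concatenation, so its effect is easy to inspect in isolation; a standalone sketch mirroring the inline logic above (`build_enhanced_objective` is a hypothetical helper, not part of this commit):

```python
def build_enhanced_objective(objective: str, generation_count: int,
                             executed_steps: int, long_steps: int = 10) -> str:
    """Standalone mirror of the inline objective-enhancement logic above."""
    if generation_count > 0:
        objective += (f" (Context: Already generated {generation_count} rounds of "
                      f"dynamic steps, be selective about additional generation)")
    if executed_steps > long_steps:
        objective += (f" (Context: Test already has {executed_steps} steps, "
                      f"consider if more steps add meaningful value)")
    return objective

# With 2 prior generation rounds at step 12, both context hints are appended:
print(build_enhanced_objective("Verify checkout flow", 2, 12))
```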
@@ -752,13 +753,35 @@ def extract_path(u):
                         logging.info(f"Generated {len(dynamic_steps)} dynamic test steps with strategy '{strategy}': {reason}")
                         case_steps = case.get("steps", [])

-                        # Convert dynamic steps to the standard format
+                        # Increment generation count since we're actually adding steps
+                        dynamic_generation_count += 1
+
+                        # Convert dynamic steps to the standard format and filter duplicates
+                        def is_similar_step(step1: dict, step2: dict) -> bool:
+                            """Check if two steps are similar to avoid duplicates"""
+                            if "action" in step1 and "action" in step2:
+                                return step1["action"].lower().strip() == step2["action"].lower().strip()
+                            if "verify" in step1 and "verify" in step2:
+                                return step1["verify"].lower().strip() == step2["verify"].lower().strip()
+                            return False
+
                         formatted_dynamic_steps = []
+                        executed_and_remaining = case_steps[:i+1] + case_steps[i+1:]  # All existing steps
+
                         for dyn_step in dynamic_steps:
-                            if "action" in dyn_step:
-                                formatted_dynamic_steps.append({"action": dyn_step["action"]})
-                            if "verify" in dyn_step:
-                                formatted_dynamic_steps.append({"verify": dyn_step["verify"]})
+                            # Check for duplicates before adding
+                            is_duplicate = False
+                            for existing_step in executed_and_remaining:
+                                if is_similar_step(dyn_step, existing_step):
+                                    logging.debug(f"Skipping duplicate step: {dyn_step}")
+                                    is_duplicate = True
+                                    break
+
+                            if not is_duplicate:
+                                if "action" in dyn_step:
+                                    formatted_dynamic_steps.append({"action": dyn_step["action"]})
+                                if "verify" in dyn_step:
+                                    formatted_dynamic_steps.append({"verify": dyn_step["verify"]})

                         # Apply strategy: insert or replace
                         if strategy == "replace":
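Two small observations on the duplicate filter: `is_similar_step` only ever matches steps of the same kind (action vs. action, verify vs. verify), comparing case- and whitespace-insensitively; and `case_steps[:i+1] + case_steps[i+1:]` is simply a full copy of `case_steps`, so `list(case_steps)` would express the same intent more directly. A quick check of the matching behavior:

```python
# is_similar_step matches same-kind steps, ignoring case and surrounding whitespace.
assert is_similar_step({"action": "  Click Login  "}, {"action": "click login"})
assert not is_similar_step({"action": "click login"}, {"verify": "click login"})
assert not is_similar_step({"verify": "toast shown"}, {"verify": "modal shown"})
```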
