from webqa_agent.testers.case_gen.utils.message_converter import convert_intermediate_steps_to_messages
from webqa_agent.utils.log_icon import icon

+LONG_STEPS = 10

# ============================================================================
# Dynamic Step Generation Helper Functions
@@ -126,19 +127,23 @@ async def generate_dynamic_steps_with_llm(
    max_steps: int,
    llm: any,
    current_case: dict = None,
-    screenshot: str = None
+    screenshot: str = None,
+    tool_output: str = None,
+    step_success: bool = True
) -> dict:
    """Generate dynamic test steps using LLM with full test case context and visual information

    Args:
        dom_diff: New DOM elements detected
-        last_action: The action that triggered the new elements (successfully executed)
+        last_action: The action that triggered the new elements
        test_objective: Overall test objective
        executed_steps: Number of steps executed so far
        max_steps: Maximum number of steps to generate
        llm: LLM instance for generation
        current_case: Complete test case containing all steps for context
        screenshot: Base64 screenshot of current page state for visual context
+        tool_output: Output from the tool execution for context (optional)
+        step_success: Whether the previous step executed successfully (default: True)

    Returns:
        Dict containing strategy ("insert" or "replace") and generated test steps
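
For reference, a minimal sketch of the dict this function returns, based on the parsing and `result_data` assembly further down in this diff (field values are illustrative, not from the source):

```python
# Illustrative only; the shape follows the result_data built below, values are hypothetical.
example_result = {
    "strategy": "insert",  # "insert" or "replace"
    "reason": "New elements exercise a different aspect of the objective",
    "steps": [
        {"action": "Click the newly revealed export button"},
        {"verify": "A download dialog appears"},
    ],
    # Present only when the LLM returned a QAG analysis block:
    "analysis": {
        "q1_can_complete_alone": False,
        "q2_different_aspects": True,
        "q3_remaining_redundant": False,
    },
}
```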
@@ -179,78 +184,59 @@ async def generate_dynamic_steps_with_llm(
{json.dumps(remaining_steps, ensure_ascii=False, indent=2) if remaining_steps else "None"}
"""

-    # Build multi-modal user prompt with success context and insertion strategy
+    # Build multi-modal user prompt with dynamic status context
    visual_context_section = ""
    if screenshot:
+        execution_context = "AFTER the execution of the last action" if step_success else "AFTER the attempted execution of the last action"
        visual_context_section = f"""
## Current Page Visual Context
-The attached screenshot shows the current state of the page AFTER the successful execution of the last action.
+The attached screenshot shows the current state of the page {execution_context}.
Use this visual information along with the DOM diff to understand the complete UI state.
+"""
+
+    # Build context based on actual execution result
+    if step_success:
+        action_status = f"✅ SUCCESSFULLY EXECUTED: \"{last_action}\""
+        status_context = "The above action has been completed successfully. Do NOT re-plan or duplicate this action."
+        execution_description = "After the successful action execution"
+    else:
+        action_status = f"⚠️ FAILED/PARTIAL EXECUTION: \"{last_action}\""
+        status_context = "The above action failed or partially succeeded. Consider recovery steps or alternative approaches."
+        execution_description = "After the failed/partial action execution"
+
+    # Include tool output for better context
+    tool_output_section = ""
+    if tool_output:
+        # Truncate overly long output to prevent prompt overflow
+        # (the 4000-char cap is an illustrative safeguard, not a tuned value)
+        if len(tool_output) > 4000:
+            tool_output = tool_output[:4000] + "... [truncated]"
+        tool_output_section = f"""
+
+## Execution Details
+{tool_output}
189215"""
190216
191217 user_prompt = f"""
192218## Previous Action Status
193- ✅ SUCCESSFULLY EXECUTED: " { last_action } "
194- The above action has been completed successfully. Do NOT re-plan or duplicate this action.
219+ { action_status }
220+ { status_context } { tool_output_section }
195221
196222## New UI Elements Detected
197- After the successful action execution , { len (new_elements )} new UI elements appeared:
223+ { execution_description } , { len (new_elements )} new UI elements appeared:
198224{ json .dumps (new_elements , ensure_ascii = False , indent = 2 )}
199225
200226{ visual_context_section }
201227
202228{ test_case_context }
203229
204- ## Structured Analysis Requirements
230+ ## Analysis Context
205231Max steps to generate: { max_steps }
206232Test Objective: "{ test_objective } "
207233
208- ### Step 1: Calculate Objective Completion Score
209- Assess what percentage of the remaining test objective can be achieved using ONLY these new elements:
210- - **100%**: New elements fully complete ALL remaining objectives independently
211- - **75-99%**: Elements achieve most objectives with minor gaps
212- - **25-74%**: Significant contribution but requires original steps
213- - **0-24%**: Minimal or supplementary value only
214-
215- ### Step 2: Apply Quantitative Decision Framework
216- **Primary Decision Rules:**
217- - Score ≥ 75% AND remaining steps don't test different aspects → "replace"
218- - Score < 75% OR remaining steps test different aspects → "insert"
219-
220- ### Step 3: Binary Validation Checklist
221- Answer these YES/NO questions:
222- □ Can new elements complete the test objective independently?
223- □ Do remaining steps become unnecessary after using new elements?
224- □ Do new elements test the SAME aspects as remaining steps?
225- □ Is there a more efficient path through new elements?
226-
227- **Scoring**: 3+ YES → "replace", ≤2 YES → "insert"
228-
229- ### Step 4: Generate Structured Response
230- Return your analysis in this EXACT format:
231- ```json
232- {{
233- "analysis": {{
234- "objective_completion_score": [0-100],
235- "can_complete_objective_alone": [true/false],
236- "remaining_steps_redundant": [true/false],
237- "confidence_level": ["HIGH"|"MEDIUM"|"LOW"]
238- }},
239- "strategy": "insert" or "replace",
240- "reason": "Based on [X]% completion score: [detailed explanation of decision logic]",
241- "steps": [
242- {{"action": "specific action description"}},
243- {{"verify": "specific verification description"}}
244- ]
245- }}
246- ```
247-
248- **For irrelevant elements**: {{"analysis": {{"objective_completion_score": 0, "can_complete_objective_alone": false, "remaining_steps_redundant": false, "confidence_level": "HIGH"}}, "strategy": "insert", "reason": "Elements provide no functional value", "steps": []}}
234+ Please analyze these new UI elements using the QAG methodology and generate appropriate test steps if needed.
249235 """
250236
251237 logging .debug (f"Requesting LLM to generate dynamic steps for { len (new_elements )} new elements" )
252238
253- # Call LLM with multi-modal context if screenshot available
239+ # Call LLM with proper message structure
254240 if screenshot :
255241 # Multi-modal call with screenshot
256242 messages = [
@@ -266,10 +252,14 @@ async def generate_dynamic_steps_with_llm(
                ]
            }
        ]
-        response = await llm.ainvoke(messages)
    else:
-        # Text-only call
-        response = await llm.ainvoke(system_prompt + "\n" + user_prompt)
+        # Text-only call with proper message structure
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+
+    response = await llm.ainvoke(messages)

    # Parse response
    if hasattr(response, 'content'):
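
The diff elides the middle of the multi-modal `messages` construction; a plausible sketch of the elided part is shown below, reusing the surrounding function's variables. The exact image payload keys are an assumption (standard OpenAI-style content parts), not confirmed by this hunk:

```python
# Hypothetical reconstruction of the elided multi-modal message (payload shape assumed):
messages = [
    {"role": "system", "content": system_prompt},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            # Screenshot passed as a base64 data URL (assumed encoding)
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot}"}},
        ],
    },
]
```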
@@ -289,27 +279,29 @@ async def generate_dynamic_steps_with_llm(
    reason = result.get("reason", "No reason provided")
    steps = result.get("steps", [])

-    # Extract and validate analysis fields (new format)
+    # Extract and validate analysis fields (QAG format)
    analysis = result.get("analysis", {})
-    completion_score = analysis.get("objective_completion_score", 0) if isinstance(analysis, dict) else 0
-    can_complete_alone = analysis.get("can_complete_objective_alone", False) if isinstance(analysis, dict) else False
-    steps_redundant = analysis.get("remaining_steps_redundant", False) if isinstance(analysis, dict) else False
-    confidence = analysis.get("confidence_level", "MEDIUM") if isinstance(analysis, dict) else "MEDIUM"
+    q1_can_complete_alone = analysis.get("q1_can_complete_alone", False) if isinstance(analysis, dict) else False
+    q2_different_aspects = analysis.get("q2_different_aspects", False) if isinstance(analysis, dict) else False
+    q3_remaining_redundant = analysis.get("q3_remaining_redundant", False) if isinstance(analysis, dict) else False

    # Validate strategy value
    if strategy not in ["insert", "replace"]:
        logging.warning(f"Invalid strategy '{strategy}', defaulting to 'insert'")
        strategy = "insert"

-    # Validate completion score if provided
-    if not isinstance(completion_score, (int, float)) or not (0 <= completion_score <= 100):
-        logging.debug(f"Invalid completion score {completion_score}, defaulting to 0")
-        completion_score = 0
+    # Validate QAG analysis fields
+    if not isinstance(q1_can_complete_alone, bool):
+        logging.debug(f"Invalid q1_can_complete_alone {q1_can_complete_alone}, defaulting to False")
+        q1_can_complete_alone = False

-    # Validate confidence level
-    if confidence not in ["HIGH", "MEDIUM", "LOW"]:
-        logging.debug(f"Invalid confidence level {confidence}, defaulting to MEDIUM")
-        confidence = "MEDIUM"
+    if not isinstance(q2_different_aspects, bool):
+        logging.debug(f"Invalid q2_different_aspects {q2_different_aspects}, defaulting to False")
+        q2_different_aspects = False
+
+    if not isinstance(q3_remaining_redundant, bool):
+        logging.debug(f"Invalid q3_remaining_redundant {q3_remaining_redundant}, defaulting to False")
+        q3_remaining_redundant = False

    # Validate and limit step count
    valid_steps = []
@@ -318,30 +310,26 @@ async def generate_dynamic_steps_with_llm(
        if isinstance(step, dict) and ("action" in step or "verify" in step):
            valid_steps.append(step)

-    # Enhanced logging with analysis data
-    if completion_score > 0:
-        logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' (score: {completion_score}%, confidence: {confidence}) from {len(new_elements)} new elements")
-    else:
-        logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' from {len(new_elements)} new elements")
+    # Enhanced logging with QAG analysis data
+    logging.info(f"Generated {len(valid_steps)} dynamic steps with strategy '{strategy}' from {len(new_elements)} new elements")

    logging.debug(f"Strategy reason: {reason}")
    if analysis:
-        logging.debug(f"Analysis: completion_score={completion_score}%, can_complete_alone={can_complete_alone}, steps_redundant={steps_redundant}, confidence={confidence}")
+        logging.debug(f"QAG Analysis: q1_can_complete_alone={q1_can_complete_alone}, q2_different_aspects={q2_different_aspects}, q3_remaining_redundant={q3_remaining_redundant}")

-    # Return enhanced result with analysis
+    # Return enhanced result with QAG analysis
    result_data = {
        "strategy": strategy,
        "reason": reason,
        "steps": valid_steps
    }

-    # Include analysis if provided (backward compatibility)
+    # Include QAG analysis if provided
    if analysis:
        result_data["analysis"] = {
-            "objective_completion_score": completion_score,
-            "can_complete_objective_alone": can_complete_alone,
-            "remaining_steps_redundant": steps_redundant,
-            "confidence_level": confidence
+            "q1_can_complete_alone": q1_can_complete_alone,
+            "q2_different_aspects": q2_different_aspects,
+            "q3_remaining_redundant": q3_remaining_redundant
        }

    return result_data
@@ -583,6 +571,7 @@ def extract_path(u):
    total_steps = len(case.get("steps", []))
    failed_steps = []  # Track failed steps for summary generation
    case_modified = False  # Track if case was modified with dynamic steps
+    dynamic_generation_count = 0  # Track how many times dynamic generation occurred

    for i, step in enumerate(case.get("steps", [])):
        instruction_to_execute = step.get("action") or step.get("verify")
@@ -707,7 +696,7 @@ def extract_path(u):

        # --- Dynamic Step Generation ---
        # Check if dynamic step generation is enabled and current step succeeded
-        if (i + 1) not in failed_steps and step_type == "Action" and "[success]" in result['intermediate_steps'][0][1].lower():
+        if step_type == "Action":
            # Get dynamic step generation config from state
            dynamic_config = state.get("dynamic_step_generation", {
                "enabled": False,
@@ -731,16 +720,28 @@ def extract_path(u):
                logging.debug("Capturing screenshot for dynamic step generation context")
                screenshot = await ui_tester_instance._actions.b64_page_screenshot()

+                # Enhance objective with generation context for smarter LLM decision-making
+                enhanced_objective = case.get("objective", "")
+                if dynamic_generation_count > 0:
+                    enhanced_objective += f" (Context: Already generated {dynamic_generation_count} rounds of dynamic steps, be selective about additional generation)"
+                if i + 1 > LONG_STEPS:  # Long test indicator
+                    enhanced_objective += f" (Context: Test already has {i + 1} steps, consider if more steps add meaningful value)"
+
+                # Determine if current step succeeded based on failed_steps list
+                step_success = (i + 1) not in failed_steps
+
                # Generate dynamic test steps with complete context and visual information
                dynamic_result = await generate_dynamic_steps_with_llm(
                    dom_diff=dom_diff,
                    last_action=instruction_to_execute,
-                    test_objective=case.get("objective", ""),
+                    test_objective=enhanced_objective,
                    executed_steps=i + 1,
                    max_steps=max_dynamic_steps,
                    llm=llm,
                    current_case=case,
-                    screenshot=screenshot
+                    screenshot=screenshot,
+                    tool_output=tool_output,
+                    step_success=step_success
                )

                # Handle dynamic steps based on LLM strategy decision
@@ -752,13 +753,35 @@ def extract_path(u):
                    logging.info(f"Generated {len(dynamic_steps)} dynamic test steps with strategy '{strategy}': {reason}")
                    case_steps = case.get("steps", [])

-                    # Convert dynamic steps to the standard format
+                    # Increment generation count since we're actually adding steps
+                    dynamic_generation_count += 1
+
+                    # Convert dynamic steps to the standard format and filter duplicates
+                    def is_similar_step(step1: dict, step2: dict) -> bool:
+                        """Check if two steps are similar to avoid duplicates"""
+                        if "action" in step1 and "action" in step2:
+                            return step1["action"].lower().strip() == step2["action"].lower().strip()
+                        if "verify" in step1 and "verify" in step2:
+                            return step1["verify"].lower().strip() == step2["verify"].lower().strip()
+                        return False
+
                    formatted_dynamic_steps = []
+                    executed_and_remaining = list(case_steps)  # all existing steps (executed + remaining)
+
                    for dyn_step in dynamic_steps:
-                        if "action" in dyn_step:
-                            formatted_dynamic_steps.append({"action": dyn_step["action"]})
-                        if "verify" in dyn_step:
-                            formatted_dynamic_steps.append({"verify": dyn_step["verify"]})
+                        # Check for duplicates before adding
+                        is_duplicate = False
+                        for existing_step in executed_and_remaining:
+                            if is_similar_step(dyn_step, existing_step):
+                                logging.debug(f"Skipping duplicate step: {dyn_step}")
+                                is_duplicate = True
+                                break
+
+                        if not is_duplicate:
+                            if "action" in dyn_step:
+                                formatted_dynamic_steps.append({"action": dyn_step["action"]})
+                            if "verify" in dyn_step:
+                                formatted_dynamic_steps.append({"verify": dyn_step["verify"]})

                    # Apply strategy: insert or replace
                    if strategy == "replace":
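
A quick illustration of how the new duplicate filter behaves, using hypothetical step dicts (matching is case- and whitespace-insensitive, and only like-typed steps can match):

```python
# Illustrative checks of is_similar_step (inputs are hypothetical):
assert is_similar_step({"action": "Click Submit "}, {"action": "click submit"})        # same action modulo case/whitespace
assert not is_similar_step({"action": "Click Submit"}, {"verify": "Form submitted"})   # action vs verify never match
assert not is_similar_step({"verify": "Toast appears"}, {"verify": "Modal appears"})   # different verify text
```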