feat: 1. add schema for action tool and verify tool; 2. synchronize actions with action_executor.py; 3. update crawl parameters

seancoding-day · seancoding-day · commit 832270428322 · 2025-11-17T16:45:18.000+08:00
diff --git a/webqa_agent/actions/action_executor.py b/webqa_agent/actions/action_executor.py
@@ -145,7 +145,7 @@ async def _execute_keyboard_press(self, action):
         else:
             return {"success": False, "message": "Keyboard press failed."}
 
-    async def _execute_get_new_page(self, action):
+    async def _execute_get_new_page(self):
         """Execute get new page action."""
         success = await self._actions.get_new_page()
         if success:
@@ -313,7 +313,7 @@ async def _execute_go_to_page(self, action):
             logging.error(f"Go to page action failed: {str(e)}")
             return {"success": False, "message": f"Navigation failed: {str(e)}", "playwright_error": str(e)}
 
-    async def _execute_go_back(self, action):
+    async def _execute_go_back(self):
         """Execute browser back navigation action."""
         try:
             if hasattr(self._actions, 'go_back'):
diff --git a/webqa_agent/testers/case_gen/graph.py b/webqa_agent/testers/case_gen/graph.py
@@ -296,7 +296,7 @@ async def reflect_and_replan(state: MainGraphState) -> dict:
     logging.debug(f"current page crawled result: {page_content_summary}")
     screenshot = await ui_tester._actions.b64_page_screenshot(file_name="reflection", save_to_log=False, full_page=False)
     await dp.remove_marker()
-    await dp.crawl(highlight=False, highlight_text=True, viewport_only=True)
+    await dp.crawl(highlight=False, filter_text=True, viewport_only=True)
     page_structure = dp.get_text()
     logging.debug(f"----- reflection ---- Page structure: {page_structure}")
 
diff --git a/webqa_agent/testers/case_gen/prompts/agent_prompts.py b/webqa_agent/testers/case_gen/prompts/agent_prompts.py
@@ -32,7 +32,7 @@ def get_execute_system_prompt(case: dict) -> str:
 
 - **`execute_ui_action(action: str, target: str, value: Optional[str], description: Optional[str], clear_before_type: bool)`**:
   Performs UI interactions such as clicking, typing, scrolling, dropdown selection, etc.
-  - `action`: Action type ('click', 'type', 'scroll', 'SelectDropdown', 'clear', etc.)
+  - `action`: Action type ('Tap', 'Input', 'Scroll', 'SelectDropdown', 'Clear', 'Hover', 'KeyboardPress', 'Upload', 'Drag', 'GoToPage', 'GoBack', 'Sleep', 'GetNewPage', 'Mouse')
   - `target`: Element descriptor (use natural language descriptions)
   - `value`: Input value for text-based actions
   - `description`: Purpose of the action for logging and context
@@ -285,9 +285,9 @@ def get_execute_system_prompt(case: dict) -> str:
 
 ### Example 1: Form Field Validation Recovery
 **Context**: Registration form with character length requirements
-**Initial Action**: `execute_ui_action(action='type', target='usage scenario field', value='test', description='Enter usage scenario')`
+**Initial Action**: `execute_ui_action(action='Input', target='usage scenario field', value='test', description='Enter usage scenario')`
 **Tool Response**: `[FAILURE] Validation error detected: Usage scenario must be at least 30 characters`
-**Recovery Action**: `execute_ui_action(action='type', target='usage scenario field', value='This is a comprehensive usage scenario description for research and development purposes in academic and commercial settings', description='Enter extended usage scenario meeting length requirements', clear_before_type=True)`
+**Recovery Action**: `execute_ui_action(action='Input', target='usage scenario field', value='This is a comprehensive usage scenario description for research and development purposes in academic and commercial settings', description='Enter extended usage scenario meeting length requirements', clear_before_type=True)`
 
 ### Example 2: Dropdown Language Adaptation
 **Context**: Bilingual interface with Chinese dropdown options
@@ -297,37 +297,37 @@ def get_execute_system_prompt(case: dict) -> str:
 
 ### Example 3: Dynamic Content Waiting
 **Context**: API-populated dropdown requiring wait time
-**Step 1**: `execute_ui_action(action='click', target='country dropdown', description='Open country selection dropdown')`
+**Step 1**: `execute_ui_action(action='Tap', target='country dropdown', description='Open country selection dropdown')`
 **Tool Response**: `[SUCCESS] Dropdown opened, loading options...`
-**Step 2**: `execute_ui_action(action='sleep', target='', value='2000', description='Wait for options to load')`
-**Step 3**: `execute_ui_action(action='click', target='option containing "Canada"', description='Select Canada from loaded options')`
+**Step 2**: `execute_ui_action(action='Sleep', target='', value='2000', description='Wait for options to load')`
+**Step 3**: `execute_ui_action(action='Tap', target='option containing "Canada"', description='Select Canada from loaded options')`
 
 ### Example 4: Element State Change Handling
 **Context**: Button state change after interaction
-**Initial Action**: `execute_ui_action(action='click', target='submit button', description='Submit form')`
+**Initial Action**: `execute_ui_action(action='Tap', target='submit button', description='Submit form')`
 **Tool Response**: `[SUCCESS] Form submitted, button disabled and showing 'Processing...'`
-**Recovery Action**: `execute_ui_action(action='wait', target='', value='3000', description='Wait for processing to complete')`
+**Recovery Action**: `execute_ui_action(action='Sleep', target='', value='3000', description='Wait for processing to complete')`
 **Follow-up**: `execute_ui_assertion(assertion='Verify success message appears and button returns to normal state')`
 
 ### Example 5: Multi-Action Instruction Handling
 **Context**: Instruction contains multiple actions "Browse the homepage top navigation bar, click one by one: 'Visitor', 'Alumni', 'Donate', 'Careers' links"
 **First Action Identification**: The first mentioned action is "Visitor" link
-**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='click', target='Visitor link', description='Click the visitor link in the top navigation bar')`
-**Tool Response**: `[SUCCESS] Action 'click' on 'Visitor link' completed successfully`
+**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='Tap', target='Visitor link', description='Click the visitor link in the top navigation bar')`
+**Tool Response**: `[SUCCESS] Action 'Tap' on 'Visitor link' completed successfully`
 **Agent Reporting**: Report completion of the single action and allow framework to proceed to next step
 
 ### Example 6: Another Multi-Action Instruction Handling
 **Context**: Instruction contains "Click on the 'Login', 'Register', and 'Help' links in the header"
 **First Action Identification**: The first mentioned action is "Login" link
-**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='click', target='Login link', description='Click the Login link in the header')`
-**Tool Response**: `[SUCCESS] Action 'click' on 'Login link' completed successfully`
+**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='Tap', target='Login link', description='Click the Login link in the header')`
+**Tool Response**: `[SUCCESS] Action 'Tap' on 'Login link' completed successfully`
 **Agent Reporting**: Report completion of the single action and allow framework to proceed to next step
 
 ### Example 7: Numbered List Multi-Action Handling
 **Context**: Instruction contains "1. Enter username 2. Enter password 3. Click submit"
 **First Action Identification**: The numbered step #1 is "Enter username"
-**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='type', target='username field', value='testuser', description='Enter username in the username field')`
-**Tool Response**: `[SUCCESS] Action 'type' on 'username field' completed successfully`
+**Correct Agent Response**: Execute only the FIRST action - `execute_ui_action(action='Input', target='username field', value='testuser', description='Enter username in the username field')`
+**Tool Response**: `[SUCCESS] Action 'Input' on 'username field' completed successfully`
 **Agent Reporting**: Report completion of the single action and allow framework to proceed to next step
 
 ## Test Completion Protocol
diff --git a/webqa_agent/testers/case_gen/tools/element_action_tool.py b/webqa_agent/testers/case_gen/tools/element_action_tool.py
@@ -7,21 +7,88 @@
 import datetime
 import json
 import logging
-from typing import Any, Dict, Optional
+from typing import Optional, Type
 
 from langchain_core.tools import BaseTool
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 from webqa_agent.crawler.deep_crawler import DeepCrawler
-from webqa_agent.testers.case_gen.prompts.tool_prompts import get_error_detection_prompt
 from webqa_agent.testers.function_tester import UITester
 
 
+class UIActionSchema(BaseModel):
+    """Schema for UI action tool arguments."""
+
+    action: str = Field(
+        description=(
+            "Type of UI action to perform. Supported actions: "
+            "'Tap' - Click on an element; "
+            "'Input' - Type text into an input field; "
+            "'SelectDropdown' - Select an option from a dropdown menu (supports cascade selection with comma-separated paths); "
+            "'Scroll' - Scroll the page with configurable modes ('once', 'untilBottom', 'untilTop') and optional distance; "
+            "'Clear' - Clear the content of an input field; "
+            "'Hover' - Hover over an element; "
+            "'KeyboardPress' - Press a keyboard key; "
+            "'Upload' - Upload a file; "
+            "'Drag' - Drag an element to a target position; "
+            "'GoToPage' - Navigate to a URL; "
+            "'GoBack' - Navigate back to the previous page; "
+            "'Sleep' - Wait for a specified duration; "
+            "'GetNewPage' - Switch to a new tab or window; "
+            "'Mouse' - Move mouse cursor or scroll mouse wheel."
+        )
+    )
+
+    target: str = Field(
+        description=(
+            "Element identifier or selector to target. "
+            "For most actions, this should be the element ID from the page description. "
+            "For Scroll actions, this can be a scroll target description. "
+            "For GoToPage action, this should be the URL."
+        )
+    )
+
+    value: Optional[str] = Field(
+        default=None,
+        description=(
+            "Value to use for the action. "
+            "Required for 'Input' action (text to type), "
+            "'SelectDropdown' action (option text or comma-separated cascade path like 'Category,Subcategory,Item'), "
+            "'Scroll' action (direction 'up' or 'down', with optional scrollType and distance description), "
+            "'KeyboardPress' action (key name like 'Enter', 'Tab', 'Escape', etc.), "
+            "'Upload' action (file path), "
+            "'Sleep' action (duration in milliseconds), "
+            "'Mouse' action (operation type: 'move' for cursor positioning or 'wheel' for scrolling). "
+            "Optional for 'Drag' action (target position description), "
+            "'GetNewPage' action (tab/window identifier). "
+            "Optional for other actions."
+        )
+    )
+
+    description: Optional[str] = Field(
+        default=None,
+        description=(
+            "Optional custom description of what this action is intended to do. "
+            "Helps provide context for the action in test reports."
+        )
+    )
+
+    clear_before_type: bool = Field(
+        default=False,
+        description=(
+            "Whether to clear the input field before typing. "
+            "Only applicable for 'Input' action. "
+            "Set to True to clear existing content before typing new text."
+        )
+    )
+
+
 class UITool(BaseTool):
     """A tool to interact with a UI via a UITester instance."""
 
     name: str = "execute_ui_action"
     description: str = "Executes a UI action using the UITester and returns a structured summary of the new page state."
+    args_schema: Type[BaseModel] = UIActionSchema
     ui_tester_instance: UITester = Field(...)
 
     async def get_full_page_context(
@@ -36,7 +103,7 @@ async def get_full_page_context(
         logging.debug(f"Retrieving page context for analysis (viewport_only={viewport_only})")
         page = self.ui_tester_instance.driver.get_page()
         dp = DeepCrawler(page)
-        await dp.crawl(highlight=True, highlight_text=True, viewport_only=viewport_only)
+        await dp.crawl(highlight=True, filter_text=True, viewport_only=viewport_only)
         page_structure = dp.get_text()
 
         screenshot = None
@@ -77,20 +144,47 @@ async def _arun(
             logging.debug(f"Using custom description: {description}")
 
         # Build the action phrase
-        if action.lower() == "click":
+        if action == "Tap":
             action_phrase = f"Click on the {target}"
-        elif action.lower() == "type":
+        elif action == "Input":
             if clear_before_type:
                 action_phrase = f"Clear the {target} field and then type '{value}'"
                 logging.debug("Using clear-before-type strategy")
             else:
                 action_phrase = f"Type '{value}' in the {target}"
-        elif action.lower() == "selectdropdown":
+        elif action == "SelectDropdown":
             action_phrase = f"From the {target}, select the option '{value}'"
-        elif action.lower() == "scroll":
+        elif action == "Scroll":
             action_phrase = f"Scroll {value or 'down'} on the page"
-        elif action.lower() == "clear":
+        elif action == "Clear":
             action_phrase = f"Clear the content of {target}"
+        elif action == "Hover":
+            action_phrase = f"Hover over {target}"
+        elif action == "KeyboardPress":
+            action_phrase = f"Press the {value} key"
+        elif action == "Upload":
+            action_phrase = f"Upload file {value} to {target}"
+        elif action == "Drag":
+            action_phrase = f"Drag {target}"
+            if value:
+                action_phrase += f" to {value}"
+        elif action == "GoToPage":
+            action_phrase = f"Navigate to {target}"
+        elif action == "GoBack":
+            action_phrase = f"Navigate back to the previous page"
+        elif action == "Sleep":
+            action_phrase = f"Wait for {value or '1000'} milliseconds"
+        elif action == "GetNewPage":
+            action_phrase = f"Switch to new page/tab"
+            if value:
+                action_phrase += f" {value}"
+        elif action == "Mouse":
+            if value and 'move' in value.lower():
+                action_phrase = f"Move mouse cursor to {target}"
+            elif value and 'wheel' in value.lower():
+                action_phrase = f"Scroll mouse wheel on {target}"
+            else:
+                action_phrase = f"Perform mouse action on {target}"
         else:
             action_phrase = f"{action} on {target}"
             if value:
@@ -168,11 +262,28 @@ async def _arun(
             return f"[FAILURE] {error_msg}"
 
 
+class UIAssertionSchema(BaseModel):
+    """Schema for UI assertion tool arguments."""
+
+    assertion: str = Field(
+        description=(
+            "The assertion or validation to perform on the current page state. "
+            "Should be a clear, specific statement of what to verify. "
+            "Examples: "
+            "'The login button should be visible', "
+            "'The error message should contain the text \"Invalid credentials\"', "
+            "'The page title should be \"Dashboard\"', "
+            "'There should be 5 items in the shopping cart'."
+        )
+    )
+
+
 class UIAssertTool(BaseTool):
     """A tool to perform UI assertions via a UITester instance."""
 
     name: str = "execute_ui_assertion"
     description: str = "Performs a UI assertion/validation using the UITester and returns the verification result."
+    args_schema: Type[BaseModel] = UIAssertionSchema
     ui_tester_instance: UITester = Field(...)
 
     def _run(self, assertion: str) -> str: