Skip to content

Commit 4853c51

Browse files
authored
feat: add mouse action and improve descriptions for actions prompt (#79)
* feat: add mouse action and improve descriptions for actions prompt * fix: add Clear, remove Check, update schema and examples * fix: remove Check action
1 parent 50ffa3a commit 4853c51

File tree

3 files changed

+120
-19
lines changed

3 files changed

+120
-19
lines changed

webqa_agent/actions/action_executor.py

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,13 @@ def __init__(self, action_handler):
1414
"Clear": self._execute_clear,
1515
"Scroll": self._execute_scroll,
1616
"KeyboardPress": self._execute_keyboard_press,
17-
"FalsyConditionStatement": self._execute_falsy,
18-
"Check": self._execute_check,
1917
"GetNewPage": self._execute_get_new_page,
2018
"Upload": self._execute_upload,
2119
"SelectDropdown": self._execute_select_dropdown,
2220
"Drag": self._execute_drag,
2321
"GoToPage": self._execute_go_to_page, # Added missing action
2422
"GoBack": self._execute_go_back, # Added browser back navigation
23+
"Mouse": self._execute_mouse, # Added mouse action
2524
}
2625

2726
async def initialize(self):
@@ -146,14 +145,6 @@ async def _execute_keyboard_press(self, action):
146145
else:
147146
return {"success": False, "message": "Keyboard press failed."}
148147

149-
async def _execute_falsy(self, action):
150-
"""Execute falsy condition statement."""
151-
return {"success": True, "message": "Falsy condition met."}
152-
153-
async def _execute_check(self, action):
154-
"""Execute check action."""
155-
return {"success": True, "message": "Check action completed."}
156-
157148
async def _execute_get_new_page(self, action):
158149
"""Execute get new page action."""
159150
success = await self._actions.get_new_page()
@@ -335,4 +326,68 @@ async def _execute_go_back(self, action):
335326
return {"success": False, "message": "Go back action not supported by action handler"}
336327
except Exception as e:
337328
logging.error(f"Go back action failed: {str(e)}")
338-
return {"success": False, "message": f"Go back failed: {str(e)}", "playwright_error": str(e)}
329+
return {"success": False, "message": f"Go back failed: {str(e)}", "playwright_error": str(e)}
330+
331+
async def _execute_mouse(self, action):
    """Execute a unified mouse action: pointer move or wheel scroll.

    Accepted ``action["param"]`` formats:
        - ``{"op": "move", "x": number, "y": number}``
        - ``{"op": "wheel", "deltaX": number, "deltaY": number}``
        - Back-compat: when ``op`` is omitted or empty, it is inferred from
          the keys that actually carry a value (x+y => move,
          deltaX/deltaY => wheel).

    A key whose value is ``None`` is treated the same as an absent key, so
    a payload that spells out every optional field (for example
    ``{"x": None, "y": None, "deltaY": 100}``) is still routed correctly,
    and a ``None`` wheel delta falls back to 0 just like a missing one.

    Args:
        action: Action dict; only its ``"param"`` entry is read here.

    Returns:
        dict: ``{"success": bool, "message": str}``. Failures are reported
        through the dict; exceptions are logged and never propagated.
    """
    import math  # local import keeps this block self-contained

    def is_number(value):
        # bool is a subclass of int, so True/False must be rejected explicitly.
        if isinstance(value, bool):
            return False
        if isinstance(value, int):
            return True
        # NaN / +-inf are floats but are not usable as coordinates or deltas.
        return isinstance(value, float) and math.isfinite(value)

    try:
        param = action.get("param")
        if not param or not isinstance(param, dict):
            return {"success": False, "message": "Missing or invalid param for mouse action"}

        op = param.get("op")
        x = param.get("x")
        y = param.get("y")
        dx = param.get("deltaX")
        dy = param.get("deltaY")

        # Auto-detect if op not provided or empty. Decide by values rather
        # than key presence so explicit nulls do not select the wrong op.
        if not op:
            if x is not None and y is not None:
                op = "move"
            elif dx is not None or dy is not None:
                op = "wheel"
            else:
                return {"success": False, "message": "Missing mouse operation parameters (x/y or deltaX/deltaY)"}

        if op == "move":
            if not self._validate_params(action, ["param.x", "param.y"]):
                return {"success": False, "message": "Missing x or y coordinates for mouse move"}

            if not is_number(x) or not is_number(y):
                return {"success": False, "message": "x and y coordinates must be numbers"}

            if await self._actions.mouse_move(x, y):
                return {"success": True, "message": f"Mouse moved to ({x}, {y})"}
            return {"success": False, "message": "Mouse move action failed"}

        if op == "wheel":
            # Missing or null deltas default to 0 (no scroll on that axis).
            dx = 0 if dx is None else dx
            dy = 0 if dy is None else dy

            if not is_number(dx) or not is_number(dy):
                return {"success": False, "message": "deltaX and deltaY must be numbers"}

            if await self._actions.mouse_wheel(dx, dy):
                return {"success": True, "message": f"Mouse wheel scrolled (deltaX: {dx}, deltaY: {dy})"}
            return {"success": False, "message": "Mouse wheel action failed"}

        logging.error("Unknown mouse op: %s. Expected 'move' or 'wheel'.", op)
        return {"success": False, "message": f"Unknown mouse operation: {op}. Expected 'move' or 'wheel'"}

    except Exception as e:
        logging.error("Mouse action execution failed: %s", e)
        return {"success": False, "message": f"Mouse action failed with an exception: {e}"}

webqa_agent/actions/action_handler.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,3 +1429,30 @@ async def drag(self, source_coords, target_coords):
14291429
except Exception as e:
14301430
logging.error(f'Drag action failed: {str(e)}')
14311431
return False
1432+
1433+
async def mouse_move(self, x: int | float, y: int | float) -> bool:
1434+
"""Move mouse to absolute coordinates (x, y)."""
1435+
try:
1436+
# Coerce to numbers in case strings are provided
1437+
target_x = float(x)
1438+
target_y = float(y)
1439+
await self.page.mouse.move(target_x, target_y)
1440+
logging.info(f"mouse move to ({target_x}, {target_y})")
1441+
await asyncio.sleep(0.1)
1442+
return True
1443+
except Exception as e:
1444+
logging.error(f"Mouse move failed: {str(e)}")
1445+
return False
1446+
1447+
async def mouse_wheel(self, delta_x: int | float = 0, delta_y: int | float = 0) -> bool:
1448+
"""Scroll the mouse wheel by delta values."""
1449+
try:
1450+
dx = float(delta_x) if delta_x is not None else 0.0
1451+
dy = float(delta_y) if delta_y is not None else 0.0
1452+
await self.page.mouse.wheel(dx, dy)
1453+
logging.info(f"mouse wheel by (deltaX={dx}, deltaY={dy})")
1454+
await asyncio.sleep(0.1)
1455+
return True
1456+
except Exception as e:
1457+
logging.error(f"Mouse wheel failed: {str(e)}")
1458+
return False

webqa_agent/llm/prompt.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ class LLMPrompt:
5151
5252
## Anchor Usage Rule
5353
Anchors are strictly used for reference during disambiguation.
54-
**NEVER** interact (Tap/Hover/Check) with anchor elements directly.
54+
**NEVER** interact (Tap/Hover) with anchor elements directly.
5555
5656
## Scroll Behavior Constraints
5757
- Avoid planning `Scroll` if the page is already at the bottom.
@@ -97,6 +97,8 @@ class LLMPrompt:
9797
* `value` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
9898
* For Input actions, if the page or validation message requires a minimum length, the value you generate MUST strictly meet or exceed this length. For Chinese, count each character as 1.
9999
* `clear_before_type`: Set to `true` if the instruction explicitly says to 'clear' the field before typing, or if you are correcting a previous failed input. Defaults to `false`.
100+
- type: 'Clear', clear the content of an input field
101+
* {{ locate: {{ id: string }}, param: null }}
100102
- type: 'KeyboardPress', press a key
101103
* {{ param: {{ value: string }} }}
102104
- type: 'Upload', upload a file (or click the upload button)
@@ -124,9 +126,6 @@ class LLMPrompt:
124126
* use this action when you need to go back to the previous page in the browser history, similar to clicking the browser's back button.
125127
- type: 'Sleep'
126128
* {{ param: {{ timeMs: number }} }}
127-
- type: 'Check'
128-
* {{ param: null }}
129-
* use this action when the instruction is a "check" or "verify" or "validate" statement.
130129
- type: 'Drag', drag an slider or element from source to target position
131130
For Drag action, use the following format:
132131
{
@@ -152,6 +151,21 @@ class LLMPrompt:
152151
* selection_path is the text of the option to be selected.
153152
* if the selection_path is a string, it means the option is the first level of the dropdown.
154153
* if the selection_path is a list, it means the option is the nth level of the dropdown.
154+
- type: 'Mouse', unified mouse action for move and wheel
155+
{
156+
"param": {
157+
"op": 'move' | 'wheel',
158+
// move operation
159+
"x"?: number,
160+
"y"?: number,
161+
// wheel operation
162+
"deltaX"?: number,
163+
"deltaY"?: number
164+
},
165+
"locate": null
166+
}
167+
* When op is omitted, auto-detect by provided fields: x+y => move; deltaX/deltaY => wheel.
168+
155169
156170
## Further Plan Format
157171
If the task isn't completed:
@@ -178,13 +192,19 @@ class LLMPrompt:
178192
179193
### Supported Actions:
180194
- Tap: Click on a specified page element (such as a button or link). Typically used to trigger a click event.
195+
- Hover: Move the mouse over a specified page element (such as a button or link). Typically used to show tooltip or hover effect.
181196
- Scroll: Scroll the page or a specific region. You can specify the direction (down, up), the scroll distance, or scroll to the edge of the page/region.
182197
- Input: Enter text into an input field or textarea. This action will replace the current value with the specified final value.
198+
- Clear: Clear the content of an input field. Requires the input's external id in locate.
183199
- Sleep: Wait for a specified amount of time (in milliseconds). Useful for waiting for page loads or asynchronous content to render.
184200
- Upload: Upload a file
185201
- KeyboardPress: Simulate a keyboard key press, such as Enter, Tab, or arrow keys.
186202
- Drag: Perform a drag-and-drop operation. Moves the mouse from a starting coordinate to a target coordinate, often used for sliders, sorting, or drag-and-drop interfaces. Requires both source and target coordinates.
187203
- SelectDropdown: Select an option from a dropdown menu which is user's expected option. The dropdown element is the first level of the dropdown menu. IF You can see the dropdown element, you cannot click the dropdown element, you should directly select the option.
204+
- GoToPage: Navigate directly to a specific URL. Useful for returning to the homepage, navigating to known pages, or entering a new web address. Requires a URL parameter.
205+
- GoBack: Navigate back to the previous page in the browser history, similar to clicking the browser's back button. Does not require any parameters.
206+
- GetNewPage: Get the new page or open in new tab or open in new window. Use this action when the previous action (e.g., clicking a link that opens in a new tab) creates a new browser context that needs to be accessed.
207+
- Mouse: Unified mouse action for move and wheel.
188208
189209
Please ensure the output is a valid **JSON** object. Do **not** include any markdown, backticks, or code block indicators.
190210
@@ -193,7 +213,7 @@ class LLMPrompt:
193213
"actions": [
194214
{
195215
"thought": "Reasoning for this action and why it's feasible on the current page.",
196-
"type": "Tap" | "Scroll" | "Input" | "Sleep" | "Check" | "Upload" | "KeyboardPress" | "Drag" | "SelectDropdown" | "GoToPage" | "GoBack",
216+
"type": "Tap" | "Hover" | "Scroll" | "Input" | "Clear" | "Sleep" | "Upload" | "KeyboardPress" | "Drag" | "SelectDropdown" | "GoToPage" | "GoBack" | "GetNewPage" | "Mouse",
197217
"param": {...} | null,
198218
"locate": {...} | null
199219
}
@@ -216,13 +236,12 @@ class LLMPrompt:
216236
- If mismatched: `targetVerified: false` and include error: "Planned element does not match the user's expected target"
217237
- If an expected element is not found on the page:
218238
- For imperative instruction: return `error` and empty actions.
219-
- For tolerant instructions like "If popup exists, close it", return `FalsyConditionStatement` action.
220239
221240
---
222241
223242
### Unified Few-shot Examples
224243
225-
#### Example 1: Tap + Sleep + Check (task incomplete)
244+
#### Example 1: Tap + Sleep (task incomplete)
226245
"Click send button and wait 50s"
227246
228247
====================
@@ -256,7 +275,7 @@ class LLMPrompt:
256275
}
257276
```
258277
259-
#### Example 2: Scroll + Check (scroll history aware)
278+
#### Example 2: Scroll (scroll history aware)
260279
```json
261280
{
262281
"actions": [

0 commit comments

Comments
 (0)