diff --git a/src/android.ts b/src/android.ts
index 31d40df..dde5939 100644
--- a/src/android.ts
+++ b/src/android.ts
@@ -18,6 +18,12 @@ interface UiAutomatorXmlNode {
 	bounds?: string;
 	hint?: string;
 	focused?: string;
+	// Flags such as clickable/focusable are strings; callers compare them to "true".
+	clickable?: string;
+	focusable?: string;
+	enabled?: string;
+	selected?: string;
+	package?: string;
 	"content-desc"?: string;
 	"resource-id"?: string;
 }
@@ -298,9 +304,18 @@ export class AndroidRobot implements Robot {
 			}
 		}
 
-		if (node.text || node["content-desc"] || node.hint) {
+		// Keep nodes that carry something identifiable (text, description, hint,
+		// resource id) as well as nodes the user can interact with (icons, buttons).
+		const hasTextOrLabel = Boolean(node.text || node["content-desc"] || node.hint || node["resource-id"]);
+		// Test the simple class name (last dotted segment): a substring test for
+		// "View" would also accept TextView, ViewGroup, ScrollView and so on.
+		const simpleClassName = (node.class || "").split(".").pop() || "";
+		const isInteractive = node.clickable === "true" || node.focusable === "true" ||
+			simpleClassName.endsWith("ImageView") || simpleClassName.endsWith("Button");
+
+		if (hasTextOrLabel || isInteractive) {
 			const element: ScreenElement = {
-				type: node.class || "text",
+				type: node.class || "element",
 				text: node.text,
 				label: node["content-desc"] || node.hint || "",
 				rect: this.getScreenElementRect(node),
diff --git a/src/server.ts b/src/server.ts
index ec1ee36..eaf7e50 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -529,5 +529,77 @@ export const createMcpServer = (): McpServer => {
 		}
 	);
 
+	tool(
+		"mobile_tap_element",
+		"Find an element on screen by query and tap it. This combines list_elements and tap functionality.",
+		{
+			device: z.string().describe("The device identifier to use. Use mobile_list_available_devices to find which devices are available to you."),
+			query: z.string().describe("Search query to find the element (matches against text, label, name, value, or identifier)")
+		},
+		async ({ device, query }) => {
+			// An empty query is a substring of every field and would match the whole screen.
+			if (query.trim() === "") {
+				throw new ActionableError("Query must not be empty. Provide part of the element's text, label, name, value, or identifier.");
+			}
+
+			const robot = getRobotFromDevice(device);
+			const elements = await robot.getElementsOnScreen();
+			type OnScreenElement = (typeof elements)[number];
+
+			// Non-blank values of the fields the query is matched against.
+			const searchableFields = (element: OnScreenElement): string[] =>
+				[element.text, element.label, element.name, element.value, element.identifier]
+					.filter((field): field is string => typeof field === "string" && field.trim() !== "");
+
+			// First non-empty field, used to name an element in messages.
+			const displayName = (element: OnScreenElement) =>
+				element.text || element.label || element.name || element.value || element.identifier;
+
+			// Center of an element's rect, which is where the tap lands.
+			const centerOf = (element: OnScreenElement) => ({
+				x: element.rect.x + (element.rect.width / 2),
+				y: element.rect.y + (element.rect.height / 2)
+			});
+
+			// Case-insensitive substring match; lower-case the query once, not per field.
+			const needle = query.toLowerCase();
+			const matchingElements = elements.filter(element =>
+				searchableFields(element).some(field => field.toLowerCase().includes(needle))
+			);
+
+			if (matchingElements.length === 0) {
+				throw new ActionableError(`No element found matching query: "${query}". Available elements: ${elements.map(element => displayName(element)).filter(t => t).join(", ")}`);
+			}
+
+			// A single exact match wins over partial ones, e.g. "OK" over "Book".
+			const exactMatches = matchingElements.filter(element =>
+				searchableFields(element).some(field => field.toLowerCase() === needle)
+			);
+			const candidates = exactMatches.length === 1 ? exactMatches : matchingElements;
+
+			if (candidates.length > 1) {
+				const matchingElementsJson = candidates.map(element => ({
+					type: element.type,
+					text: element.text,
+					label: element.label,
+					name: element.name,
+					value: element.value,
+					identifier: element.identifier,
+					coordinates: centerOf(element),
+					rect: element.rect
+				}));
+
+				throw new ActionableError(`Multiple elements found matching query: "${query}". Found ${candidates.length} matches:\n${JSON.stringify(matchingElementsJson, null, 2)}`);
+			}
+
+			const matchingElement = candidates[0];
+			const { x: centerX, y: centerY } = centerOf(matchingElement);
+
+			await robot.tap(centerX, centerY);
+
+			return `Tapped element "${displayName(matchingElement)}" at coordinates: ${centerX}, ${centerY}`;
+		}
+	);
+
 	return server;
 };