diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts index 9bacf7d04..0a9abc162 100644 --- a/packages/core/src/ai-model/common.ts +++ b/packages/core/src/ai-model/common.ts @@ -5,8 +5,10 @@ import type { MidsceneYamlFlowItem, PlanningAction, PlanningActionParamInputOrKeyPress, + PlanningActionParamLongPress, PlanningActionParamScroll, PlanningActionParamSleep, + PlanningActionParamSwipe, Rect, Size, } from '@/types'; @@ -363,6 +365,24 @@ export function buildYamlFlowFromPlans( scrollType: param.scrollType, distance: param.distance, }); + } else if (type === 'Swipe') { + const param = plan.param as PlanningActionParamSwipe; + flow.push({ + aiSwipe: null, + locate, + from: param.from, + to: param.to, + duration: param.duration, + direction: param.direction, + swipeType: param.swipeType, + distance: param.distance, + }); + } else if (type === 'LongPress') { + const param = plan.param as PlanningActionParamLongPress; + flow.push({ + aiLongPress: locate!, + duration: param.duration, + }); } else if (type === 'Sleep') { const param = plan.param as PlanningActionParamSleep; flow.push({ diff --git a/packages/core/src/ai-model/prompt/llm-planning.ts b/packages/core/src/ai-model/prompt/llm-planning.ts index 7586c12f8..b9f15cc2e 100644 --- a/packages/core/src/ai-model/prompt/llm-planning.ts +++ b/packages/core/src/ai-model/prompt/llm-planning.ts @@ -25,7 +25,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i Restriction: - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something. -- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === 'android' ? 
', AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull.' : '.'} +- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, LongPress, Swipe${pageType === 'android' ? ', AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull.' : '.'} - Don't repeat actions in the previous logs. - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}. @@ -35,6 +35,8 @@ Supporting actions: - Hover: { type: "Hover", ${vlLocateParam} } - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field. - KeyboardPress: { type: "KeyboardPress", param: { value: string } } +- LongPress: { type: "LongPress", ${vlLocateParam}, param: { duration?: number(ms) } } +- Swipe: { type: "Swipe", ${vlLocateParam} | null, param: { from?: { x: number, y: number }, to?: { x: number, y: number }, duration?: number(ms), direction: 'down' | 'up' | 'right' | 'left'(default), swipeType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance?: number } } // locate is the element to swipe. For a page-level swipe, set \`locate\` to \`null\`. - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field. 
${ pageType === 'android' @@ -95,7 +97,7 @@ You are a versatile professional in software UI automation. Your outstanding con ## Workflow 1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs. -2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === 'android' ? '/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull' : ''}). The "About the action" section below will give you more details. +2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / LongPress / Swipe / FalsyConditionStatement / Sleep ${pageType === 'android' ? '/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull' : ''}). The "About the action" section below will give you more details. 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action. 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps. 5. Consider whether the user's instruction will be accomplished after all the actions @@ -149,6 +151,31 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed: * use this action when the conditional statement talked about in the instruction is falsy. 
- type: 'Sleep' * {{ param: {{ timeMs: number }} }} +- type: 'LongPress', trigger a long press on the screen at specified coordinates + * {{ ${llmLocateParam}, param: {{ duration?: number(ms) }} }} +- type: 'Swipe', trigger a swipe gesture from one point to another on the screen + * {{ + ${llmLocateParam}, + param: {{ + from?: {{ x: number, y: number }}, + to?: {{ x: number, y: number }}, + duration?: number(ms), + direction?: 'down' | 'up' | 'right' | 'left', + swipeType?: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', + distance?: number + }} + }} + * To swipe a specific element, put the element at the center of the region in the \`locate\` field. + For a page-level swipe, set \`locate\` to \`null\`. + * If the user specifies an element, use its center as the \`from\` position. + If \`from\` and \`to\` are specified, use them directly. + * If \`to\` cannot be extracted, only extract \`direction\` and \`swipeType\` and \`distance\` for later use. + * Default values: + - \`direction\`: \`'left'\` + - \`swipeType\`: \`'untilLeft'\` + * \`swipeType\` describes swipe behavior: + - 'once' + - 'untilBottom', 'untilTop', 'untilRight', 'untilLeft' ${ pageType === 'android' ? 
`- type: 'AndroidBackButton', trigger the system "back" operation on Android devices @@ -284,7 +311,7 @@ export const planSchema: ResponseFormatJSONSchema = { type: { type: 'string', description: - 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"', + 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "Swipe", "LongPress", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"', }, param: { anyOf: [ @@ -311,6 +338,47 @@ export const planSchema: ResponseFormatJSONSchema = { required: ['direction', 'scrollType', 'distance'], additionalProperties: false, }, + { + type: 'object', + properties: { + duration: { type: ['number', 'string'] }, + from: { + anyOf: [ + { + type: 'object', + properties: { + x: { type: ['number', 'string'] }, + y: { type: ['number', 'string'] }, + }, + required: ['x', 'y'], + additionalProperties: false, + }, + { type: 'null' } + ] + }, + to: { + anyOf: [ + { + type: 'object', + properties: { + x: { type: ['number', 'string'] }, + y: { type: ['number', 'string'] }, + }, + required: ['x', 'y'], + additionalProperties: false, + }, + { type: 'null' } + ] + } + }, + additionalProperties: false + }, + + { + type: 'object', + properties: { duration: { type: ['number', 'string'] } }, + additionalProperties: false, + }, { type: 'object', properties: { reason: { type: 'string' } }, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 0d5607e45..bf5ab96e5 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -265,6 +265,8 @@ export interface PlanningAction { | 'Input' | 'KeyboardPress' | 'Scroll' + | 'Swipe' + | 'LongPress' | 'Error' | 'ExpectedFalsyCondition' | 'Assert' @@ -309,6 +311,23 @@ export interface PlanningActionParamInputOrKeyPress { 
export type PlanningActionParamScroll = scrollParam; +export interface PlanningActionParamLongPress { + duration?: number; +} +export interface PlanningActionParamSwipe { + from?: { + x: number; + y: number; + }; + to?: { + x: number; + y: number; + }; + duration?: number; + direction: 'down' | 'up' | 'right' | 'left'; + swipeType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft'; + distance?: number; +} export interface PlanningActionParamAssert { assertion: TUserPrompt; } @@ -510,7 +529,7 @@ export type ExecutionTaskInsightAssertion = ExecutionTask; /* -task - action (i.e. interact) +task - action (i.e. interact) */ export type ExecutionTaskActionApply = ExecutionTaskApply< 'Action', diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts index 8c3cf8134..023ef7d7e 100644 --- a/packages/core/src/yaml.ts +++ b/packages/core/src/yaml.ts @@ -1,4 +1,4 @@ -import type { PlanningActionParamScroll, Rect, TUserPrompt } from './types'; +import type { PlanningActionParamScroll, Rect, TUserPrompt, PlanningActionParamLongPress, PlanningActionParamSwipe } from './types'; import type { BaseElement, UIContext } from './types'; export interface LocateOption { @@ -155,13 +155,17 @@ export interface MidsceneYamlFlowItemAIKeyboardPress extends LocateOption { locate?: TUserPrompt; // where to press, optional } -export interface MidsceneYamlFlowItemAIScroll - extends LocateOption, - PlanningActionParamScroll { +export interface MidsceneYamlFlowItemAIScroll extends LocateOption, PlanningActionParamScroll { aiScroll: null; locate?: TUserPrompt; // which area to scroll, optional } - +export interface MidsceneYamlFlowItemAILongPress extends LocateOption, PlanningActionParamLongPress { + aiLongPress: TUserPrompt; +} +export interface MidsceneYamlFlowItemAISwipe extends LocateOption, PlanningActionParamSwipe{ + aiSwipe: null; + locate?: TUserPrompt; // where to swipe, optional +} export interface MidsceneYamlFlowItemEvaluateJavaScript { javascript: string; name?: 
string; @@ -187,6 +191,8 @@ export type MidsceneYamlFlowItem = | MidsceneYamlFlowItemAIInput | MidsceneYamlFlowItemAIKeyboardPress | MidsceneYamlFlowItemAIScroll + | MidsceneYamlFlowItemAISwipe + | MidsceneYamlFlowItemAILongPress | MidsceneYamlFlowItemSleep | MidsceneYamlFlowItemLogScreenshot; diff --git a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap index 2c6597bc4..66010063f 100644 --- a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap +++ b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap @@ -456,6 +456,78 @@ exports[`system prompts > planning - 4o - response format 1`] = ` ], "type": "object", }, + { + "additionalProperties": false, + "properties": { + "duration": { + "type": [ + "number", + "string", + ], + }, + "from": { + "additionalProperties": false, + "properties": { + "x": { + "type": [ + "number", + "string", + ], + }, + "y": { + "type": [ + "number", + "string", + ], + }, + }, + "required": [ + "x", + "y", + ], + "type": "object", + }, + "to": { + "additionalProperties": false, + "properties": { + "x": { + "type": [ + "number", + "string", + ], + }, + "y": { + "type": [ + "number", + "string", + ], + }, + }, + "required": [ + "x", + "y", + ], + "type": "object", + }, + }, + "required": [ + "from", + "to", + ], + "type": "object", + }, + { + "additionalProperties": false, + "properties": { + "duration": { + "type": [ + "number", + "string", + ], + }, + }, + "type": "object", + }, { "additionalProperties": false, "properties": { @@ -488,7 +560,7 @@ exports[`system prompts > planning - 4o - response format 1`] = ` "type": "string", }, "type": { - "description": "Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"", + "description": "Type of action, one of 
"Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "Swipe", "LongPress", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"", "type": "string", }, }, @@ -534,161 +606,13 @@ exports[`system prompts > planning - 4o - response format 1`] = ` } `; -exports[`system prompts > planning - 4o 1`] = ` -" -## Role - -You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users. - -## Objective - -- Decompose the instruction user asked into a series of actions -- Locate the target element if possible -- If the instruction cannot be accomplished, give a further plan. - -## Workflow - -1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs. -2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ). The "About the action" section below will give you more details. -3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action. -4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps. -5. Consider whether the user's instruction will be accomplished after all the actions - - If yes, set \`taskWillBeAccomplished\` to true - - If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details. 
- -## Constraints - -- All the actions you composed MUST be based on the page context information you get. -- Trust the "What have been done" field about the task (if any), don't repeat actions in it. -- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`. -- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field. - -## About the \`actions\` field - -The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme: - -type LocateParam = { - "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description. - "prompt"?: string // the description of the element to find. It can only be omitted when locate is null. -} | null // If it's not on the page, the LocateParam should be null - -## Supported actions - -Each action has a \`type\` and corresponding \`param\`. To be detailed: -- type: 'Tap' - * { locate: {"id": string, "prompt": string} | null } -- type: 'RightClick' - * { locate: {"id": string, "prompt": string} | null } -- type: 'Hover' - * { locate: {"id": string, "prompt": string} | null } -- type: 'Input', replace the value in the input field - * { locate: {"id": string, "prompt": string} | null, param: { value: string } } - * \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done. -- type: 'KeyboardPress', press a key - * { param: { value: string } } -- type: 'Scroll', scroll up or down. 
- * { - locate: {"id": string, "prompt": string} | null, - param: { - direction: 'down'(default) | 'up' | 'right' | 'left', - scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', - distance: null | number - } - } - * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field. - * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance. - * { param: { button: 'Back' | 'Home' | 'RecentApp' } } -- type: 'ExpectedFalsyCondition' - * { param: { reason: string } } - * use this action when the conditional statement talked about in the instruction is falsy. -- type: 'Sleep' - * { param: { timeMs: number } } - - - - -## Output JSON Format: - -The JSON format is as follows: - -{ - "actions": [ - // ... some actions - ], - "log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction. - "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction. - "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false. 
-} - -## Examples - -### Example: Decompose a task - -When the instruction is 'Click the language switch button, wait 1s, click "English"', and not log is provided - -By viewing the page screenshot and description, you should consider this and output the JSON: - -* The main steps should be: tap the switch button, sleep, and tap the 'English' option -* The language switch button is shown in the screenshot, but it's not marked with a rectangle. So we have to use the page description to find the element. By carefully checking the context information (coordinates, attributes, content, etc.), you can find the element. -* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this. -* Log what these action do: Click the language switch button to open the language options. Wait for 1 second. -* The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true. - -{ - "actions":[ - { - "type": "Tap", - "thought": "Click the language switch button to open the language options.", - "param": null, - "locate": { id: "c81c4e9a33", prompt: "The language switch button" }, - }, - { - "type": "Sleep", - "thought": "Wait for 1 second to ensure the language options are displayed.", - "param": { "timeMs": 1000 }, - } - ], - "error": null, - "more_actions_needed_by_instruction": true, - "log": "Click the language switch button to open the language options. 
Wait for 1 second", -} - -### Example: What NOT to do -Wrong output: -{ - "actions":[ - { - "type": "Tap", - "thought": "Click the language switch button to open the language options.", - "param": null, - "locate": { - { "id": "c81c4e9a33" }, // WRONG: prompt is missing - } - }, - { - "type": "Tap", - "thought": "Click the English option", - "param": null, - "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished - } - ], - "more_actions_needed_by_instruction": false, // WRONG: should be true - "log": "Click the language switch button to open the language options", -} - -Reason: -* The \`prompt\` is missing in the first 'Locate' action -* Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true -" -`; - exports[`system prompts > planning - android 1`] = ` " Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. Restriction: - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something. -- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull. +- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, LongPress, Swipe, AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull. - Don't repeat actions in the previous logs. 
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing 2d bounding box as [xmin, ymin, xmax, ymax]. @@ -698,6 +622,8 @@ Supporting actions: - Hover: { type: "Hover", locate: {bbox: [number, number, number, number], prompt: string } } - Input: { type: "Input", locate: {bbox: [number, number, number, number], prompt: string }, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field. - KeyboardPress: { type: "KeyboardPress", param: { value: string } } +- LongPress: { type: "LongPress", locate: {bbox: [number, number, number, number], prompt: string }, param: { duration?: number } } +- Swipe: { type: "Swipe", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { from?: { x: number, y: number }, to?: { x: number, y: number }, duration?: number, direction: 'down' | 'up' | 'right' | 'left'(default), swipeType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance?: number } } // locate is the element to swipe. For a page-level swipe, set \`locate\` to \`null\`. - Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field. - AndroidBackButton: { type: "AndroidBackButton", param: {} } - AndroidHomeButton: { type: "AndroidHomeButton", param: {} } @@ -767,7 +693,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i Restriction: - Don't give extra actions or plans beyond the instruction. 
ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something. -- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll. +- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, LongPress, Swipe. - Don't repeat actions in the previous logs. - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing 2d bounding box as [ymin, xmin, ymax, xmax]. @@ -777,6 +703,8 @@ Supporting actions: - Hover: { type: "Hover", locate: {bbox: [number, number, number, number], prompt: string } } - Input: { type: "Input", locate: {bbox: [number, number, number, number], prompt: string }, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field. - KeyboardPress: { type: "KeyboardPress", param: { value: string } } +- LongPress: { type: "LongPress", locate: {bbox: [number, number, number, number], prompt: string }, param: { duration?: number } } +- Swipe: { type: "Swipe", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { from?: { x: number, y: number }, to?: { x: number, y: number }, duration?: number, direction: 'down' | 'up' | 'right' | 'left'(default), swipeType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance?: number } } // locate is the element to swipe. For a page-level swipe, set \`locate\` to \`null\`. 
- Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field. @@ -822,7 +750,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i Restriction: - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something. -- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll. +- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, LongPress, Swipe. - Don't repeat actions in the previous logs. - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing 2d bounding box as [xmin, ymin, xmax, ymax]. @@ -832,6 +760,8 @@ Supporting actions: - Hover: { type: "Hover", locate: {bbox: [number, number, number, number], prompt: string } } - Input: { type: "Input", locate: {bbox: [number, number, number, number], prompt: string }, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field. 
- KeyboardPress: { type: "KeyboardPress", param: { value: string } } +- LongPress: { type: "LongPress", locate: {bbox: [number, number, number, number], prompt: string }, param: { duration?: number } } +- Swipe: { type: "Swipe", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { from?: { x: number, y: number }, to?: { x: number, y: number }, duration?: number, direction: 'down' | 'up' | 'right' | 'left'(default), swipeType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance?: number } } // locate is the element to swipe. For a page-level swipe, set \`locate\` to \`null\`. - Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field. 
diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts
index 3bbeecd73..0fca2f011 100644
--- a/packages/web-integration/src/chrome-extension/page.ts
+++ b/packages/web-integration/src/chrome-extension/page.ts
@@ -675,4 +675,166 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
     await this.detachDebugger();
     this.destroyed = true;
   }
+
+  /**
+   * Reports whether touch events should be dispatched instead of mouse events.
+   *
+   * The page's user agent is inspected through `Runtime.evaluate` only while
+   * `isMobileEmulation` is still `null`; later calls reuse the stored value.
+   */
+  private async shouldUseTouchEvents(): Promise<boolean> {
+    if (this.isMobileEmulation === null) {
+      const result = await this.sendCommandToDebugger('Runtime.evaluate', {
+        expression: `(() => {
+          return /Android|iPhone|iPad|iPod|Mobile/i.test(navigator.userAgent);
+        })()`,
+        returnByValue: true,
+      });
+      this.isMobileEmulation = result?.result?.value;
+    }
+    return Boolean(this.isMobileEmulation);
+  }
+
+  /**
+   * Presses at (`x`, `y`), holds, then releases — as a touch on pages with a
+   * mobile user agent, as a left mouse button press otherwise.
+   *
+   * @param x - horizontal coordinate sent with the input events
+   * @param y - vertical coordinate sent with the input events
+   * @param duration - hold time in ms; 500 when omitted, and always clamped
+   *   to the 300–600 range
+   */
+  async longPress(
+    x: number,
+    y: number,
+    duration?: number,
+  ) {
+    const MIN_HOLD_MS = 300;
+    const MAX_HOLD_MS = 600;
+    // `||` rather than `??` on purpose: 0 and NaN also fall back to the default.
+    const holdMs = Math.min(
+      Math.max(duration || 500, MIN_HOLD_MS),
+      MAX_HOLD_MS,
+    );
+    const hold = () => new Promise((resolve) => setTimeout(resolve, holdMs));
+
+    await this.mouse.move(x, y);
+
+    if (await this.shouldUseTouchEvents()) {
+      await this.sendCommandToDebugger('Input.dispatchTouchEvent', {
+        type: 'touchStart',
+        touchPoints: [{ x: Math.round(x), y: Math.round(y) }],
+        modifiers: 0,
+      });
+      await hold();
+      // touchEnd is sent without any touch points.
+      await this.sendCommandToDebugger('Input.dispatchTouchEvent', {
+        type: 'touchEnd',
+        touchPoints: [],
+        modifiers: 0,
+      });
+    } else {
+      await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
+        type: 'mousePressed',
+        x,
+        y,
+        button: 'left',
+        clickCount: 1,
+      });
+      await hold();
+      await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
+        type: 'mouseReleased',
+        x,
+        y,
+        button: 'left',
+        clickCount: 1,
+      });
+    }
+
+    this.latestMouseX = x;
+    this.latestMouseY = y;
+  }
+
+  /**
+   * Drags from `from` to `to` in 30 evenly spaced steps — as a touch on pages
+   * with a mobile user agent, with the left mouse button held otherwise.
+   *
+   * @param from - start point of the gesture
+   * @param to - end point of the gesture
+   * @param duration - total pause time in ms spread over the steps; 300 when
+   *   omitted, and always clamped to the 150–500 range
+   */
+  async swipe(
+    from: { x: number; y: number },
+    to: { x: number; y: number },
+    duration?: number,
+  ) {
+    const MIN_SWIPE_MS = 150;
+    const MAX_SWIPE_MS = 500;
+    const STEPS = 30;
+    // `||` rather than `??` on purpose: 0 and NaN also fall back to the default.
+    const swipeMs = Math.min(
+      Math.max(duration || 300, MIN_SWIPE_MS),
+      MAX_SWIPE_MS,
+    );
+    const stepDelayMs = swipeMs / STEPS;
+    const pause = () =>
+      new Promise((resolve) => setTimeout(resolve, stepDelayMs));
+    // Linear interpolation between the two end points.
+    const pointAt = (step: number) => ({
+      x: from.x + (to.x - from.x) * (step / STEPS),
+      y: from.y + (to.y - from.y) * (step / STEPS),
+    });
+
+    if (await this.shouldUseTouchEvents()) {
+      await this.sendCommandToDebugger('Input.dispatchTouchEvent', {
+        type: 'touchStart',
+        touchPoints: [{ x: Math.round(from.x), y: Math.round(from.y) }],
+        modifiers: 0,
+      });
+
+      for (let step = 1; step <= STEPS; step++) {
+        const { x, y } = pointAt(step);
+        await this.sendCommandToDebugger('Input.dispatchTouchEvent', {
+          type: 'touchMove',
+          touchPoints: [{ x: Math.round(x), y: Math.round(y) }],
+          modifiers: 0,
+        });
+        await pause();
+      }
+
+      // touchEnd is sent without any touch points.
+      await this.sendCommandToDebugger('Input.dispatchTouchEvent', {
+        type: 'touchEnd',
+        touchPoints: [],
+        modifiers: 0,
+      });
+    } else {
+      await this.mouse.move(from.x, from.y);
+      await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
+        type: 'mousePressed',
+        x: from.x,
+        y: from.y,
+        button: 'left',
+        clickCount: 1,
+      });
+
+      for (let step = 1; step <= STEPS; step++) {
+        const { x, y } = pointAt(step);
+        await this.mouse.move(x, y);
+        await pause();
+      }
+
+      await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
+        type: 'mouseReleased',
+        x: to.x,
+        y: to.y,
+        button: 'left',
+        clickCount: 1,
+      });
+    }
+
+    this.latestMouseX = to.x;
+    this.latestMouseY = to.y;
+  }
 }
// packages/web-integration/src/common/agent.ts
// Members added to PageAgent.

  /**
   * Locates the element described by `locatePrompt` and long-presses it.
   *
   * @param locatePrompt - description of the element to press
   * @param opt - locate options; `opt.cacheable` is forwarded to the runner
   * @param longPressParam - optional press parameters passed into the plan
   * @returns the `output` of the executed plans
   */
  async aiLongPress(
    locatePrompt: TUserPrompt,
    opt?: LocateOption,
    longPressParam?: PlanningActionParamLongPress,
  ) {
    const detailedLocateParam = this.buildDetailedLocateParam(
      locatePrompt,
      opt,
    );
    const plans = buildPlans('LongPress', detailedLocateParam, longPressParam);
    const { executor, output } = await this.taskExecutor.runPlans(
      taskTitleStr('LongPress', locateParamStr(detailedLocateParam)),
      plans,
      { cacheable: opt?.cacheable },
    );
    await this.afterTaskRunning(executor);
    return output;
  }

  /**
   * Performs a swipe, optionally anchored on a located element.
   *
   * @param swipeParam - swipe parameters passed into the plan
   * @param locatePrompt - optional description of the element to swipe on;
   *   when omitted no locate step is planned
   * @param opt - locate options; `opt.cacheable` is forwarded to the runner
   * @returns the `output` of the executed plans
   */
  async aiSwipe(
    swipeParam: PlanningActionParamSwipe,
    locatePrompt?: TUserPrompt,
    opt?: LocateOption,
  ) {
    const detailedLocateParam = locatePrompt
      ? this.buildDetailedLocateParam(locatePrompt, opt)
      : undefined;
    const plans = buildPlans('Swipe', detailedLocateParam, swipeParam);
    // The title carries the locate description only when there is one.
    const paramInTitle = locatePrompt
      ? `${locateParamStr(detailedLocateParam)} - ${swipeParamStr(swipeParam)}`
      : swipeParamStr(swipeParam);
    const { executor, output } = await this.taskExecutor.runPlans(
      taskTitleStr('Swipe', paramInTitle),
      plans,
      { cacheable: opt?.cacheable },
    );
    await this.afterTaskRunning(executor);
    return output;
  }

// packages/web-integration/src/common/plan-builder.ts
// Branches added inside buildPlans(); its `param` union is widened with
// PlanningActionParamLongPress | PlanningActionParamSwipe.

  // LongPress always needs a target: emit the locate plan, then the press.
  if (type === 'LongPress') {
    assert(locateParam, `missing locate info for action "${type}"`);
    assert(locatePlan, `missing locate info for action "${type}"`);
    const longPressPlan: PlanningAction<PlanningActionParamLongPress> = {
      type,
      param: param as PlanningActionParamLongPress,
      thought: '',
      locate: locateParam,
    };

    returnPlans = [locatePlan, longPressPlan];
  }

  // Swipe may be page-level: the locate plan is included only when present.
  if (type === 'Swipe') {
    assert(param, `missing param for action "${type}"`);

    const swipePlan: PlanningAction<PlanningActionParamSwipe> = {
      type,
      param: param as PlanningActionParamSwipe,
      thought: '',
      locate: locateParam,
    };

    returnPlans = locatePlan ? [locatePlan, swipePlan] : [swipePlan];
  }

// packages/web-integration/src/common/tasks.ts
// Branches added to the plan-to-task conversion chain in PageTaskExecutor.

      } else if (plan.type === 'Swipe') {
        const taskActionSwipe: ExecutionTaskActionApply<PlanningActionParamSwipe> =
          {
            type: 'Action',
            subType: 'Swipe',
            param: plan.param,
            thought: plan.thought,
            locate: plan.locate,
            /**
             * Resolves the swipe start and end points and performs the swipe.
             *
             * Start: centre of the located element, else `param.from`, else
             * the viewport centre. End: `param.to` when given; otherwise it is
             * derived from `swipeType` (default 'once') and, for 'once', from
             * `direction` (default 'left') and `distance` (default: half the
             * viewport along the swipe axis). A derived end point is clamped
             * to the viewport.
             */
            executor: async (taskParam, { element }) => {
              assert(
                !isAndroidPage(this.page),
                'Cannot use swipe on Android devices',
              );

              // Defaults follow the planning prompt: direction 'left',
              // swipeType 'once'. `??` also covers an explicit null from the
              // model.
              const direction = taskParam?.direction ?? 'left';
              const swipeType = taskParam?.swipeType ?? 'once';
              const duration = taskParam?.duration ?? 300;
              const distance = taskParam?.distance;
              const fromParam = taskParam?.from;
              const toParam = taskParam?.to;

              const { width, height } = await this.page.size();
              const from = element
                ? { x: element.center[0], y: element.center[1] }
                : (fromParam ?? { x: width / 2, y: height / 2 });

              // An explicit destination wins over direction/swipeType.
              if (toParam) {
                await this.page.swipe(from, toParam, duration);
                return;
              }

              let to: { x: number; y: number };
              switch (swipeType) {
                case 'once': {
                  const isVertical = direction === 'up' || direction === 'down';
                  const moveDistance =
                    distance ?? (isVertical ? height / 2 : width / 2);
                  switch (direction) {
                    case 'up':
                      to = { x: from.x, y: from.y - moveDistance };
                      break;
                    case 'down':
                      to = { x: from.x, y: from.y + moveDistance };
                      break;
                    case 'left':
                      to = { x: from.x - moveDistance, y: from.y };
                      break;
                    case 'right':
                      to = { x: from.x + moveDistance, y: from.y };
                      break;
                    default:
                      throw new Error(`Unknown direction: ${direction}`);
                  }
                  break;
                }
                case 'untilTop':
                  to = { x: from.x, y: 0 };
                  break;
                case 'untilBottom':
                  to = { x: from.x, y: height };
                  break;
                case 'untilLeft':
                  to = { x: 0, y: from.y };
                  break;
                case 'untilRight':
                  to = { x: width, y: from.y };
                  break;
                default:
                  throw new Error(`Unknown swipeType: ${swipeType}`);
              }

              // Keep the derived end point inside the viewport.
              const clamp = (v: number, lo: number, hi: number) =>
                Math.max(lo, Math.min(hi, v));
              to.x = clamp(to.x, 0, width);
              to.y = clamp(to.y, 0, height);

              await this.page.swipe(from, to, duration);
            },
          };
        tasks.push(taskActionSwipe);
      } else if (plan.type === 'LongPress') {
        const taskActionLongPress: ExecutionTaskActionApply<PlanningActionParamLongPress> =
          {
            type: 'Action',
            subType: 'LongPress',
            param: plan.param,
            thought: plan.thought,
            locate: plan.locate,
            // Long-presses the centre of the located element; the duration is
            // passed through so the page implementation applies its default.
            executor: async (taskParam, { element }) => {
              assert(
                !isAndroidPage(this.page),
                'Cannot use long press on Android devices, use android long press action instead',
              );
              assert(element, 'Element not found, cannot long press');
              const duration = taskParam?.duration;
              await this.page.longPress(
                element.center[0],
                element.center[1],
                duration,
              );
            },
          };
        tasks.push(taskActionLongPress);
      }
// packages/web-integration/src/common/ui-utils.ts

/**
 * Builds the human-readable summary of a swipe used in task titles.
 *
 * Parts are emitted in a fixed order and joined with ", ":
 * direction, swipeType, start, to, distance, duration.
 *
 * - `direction` is shown when provided. When it is missing and no explicit
 *   `to` point is given, the default 'left' is shown; with an explicit `to`
 *   no direction is invented.
 * - `distance` and `duration` are shown whenever they are set, including 0.
 *
 * @param swipeParam - swipe parameters; may be omitted
 * @returns the summary, or '' when `swipeParam` is not provided
 */
export function swipeParamStr(swipeParam?: PlanningActionParamSwipe) {
  if (!swipeParam) {
    return '';
  }

  const { direction, swipeType, from, to, distance, duration } = swipeParam;
  const parts: string[] = [];

  const shownDirection = direction || (to ? undefined : 'left');
  if (shownDirection) {
    parts.push(`direction: ${shownDirection}`);
  }
  if (swipeType) {
    parts.push(`swipeType: ${swipeType}`);
  }
  if (from) {
    parts.push(`start: (${from.x}, ${from.y})`);
  }
  if (to) {
    parts.push(`to: (${to.x}, ${to.y})`);
  }
  // `!= null` keeps an explicit 0 while still skipping null/undefined.
  if (distance != null) {
    parts.push(`distance: ${distance}`);
  }
  if (duration != null) {
    parts.push(`duration: ${duration}ms`);
  }

  return parts.join(', ');
}
// packages/web-integration/src/playwright/ai-fixture.ts
// Entries added to PlaywrightAiFixture ('aiLongPress' and 'aiSwipe' are also
// added to the `aiActionType` union accepted by generateAiFunction).

    // Exposes PageAgent.aiLongPress as a Playwright fixture.
    aiLongPress: async (
      { page }: { page: OriginPlaywrightPage },
      use: any,
      testInfo: TestInfo,
    ) => {
      await generateAiFunction({
        page,
        testInfo,
        use,
        aiActionType: 'aiLongPress',
      });
    },
    // Exposes PageAgent.aiSwipe as a Playwright fixture.
    aiSwipe: async (
      { page }: { page: OriginPlaywrightPage },
      use: any,
      testInfo: TestInfo,
    ) => {
      await generateAiFunction({
        page,
        testInfo,
        use,
        aiActionType: 'aiSwipe',
      });
    },

// Entries added to PlayWrightAiFixtureType.
// NOTE(review): the type arguments were missing from this chunk; they are
// restored here on the assumption that the entries mirror the PageAgent
// methods of the same name — confirm against the sibling entries.

  aiLongPress: (
    ...args: Parameters<PageAgent['aiLongPress']>
  ) => ReturnType<PageAgent['aiLongPress']>;
  aiSwipe: (
    ...args: Parameters<PageAgent['aiSwipe']>
  ) => ReturnType<PageAgent['aiSwipe']>;

// packages/web-integration/src/puppeteer/base-page.ts
// Gesture members added to Page.

  /**
   * Drags the mouse from `from` to `to` in 30 linearly interpolated steps
   * with the button held down.
   *
   * Nothing is dispatched when `pageType` is neither 'puppeteer' nor
   * 'playwright'.
   *
   * @param from - start point of the drag
   * @param to - end point of the drag
   * @param duration - total drag time in ms; a falsy value means 300, and
   *   the value is clamped to the range 150–500
   */
  async swipe(
    from: { x: number; y: number },
    to: { x: number; y: number },
    duration?: number,
  ) {
    const MIN_SWIPE_MS = 150;
    const MAX_SWIPE_MS = 500;
    const swipeMs = Math.min(
      MAX_SWIPE_MS,
      Math.max(MIN_SWIPE_MS, duration || 300),
    );
    debugPage(
      `mouse swipe from ${from.x}, ${from.y} to ${to.x}, ${to.y} with delay ${swipeMs}ms`,
    );

    const steps = 30;
    const stepDelay = swipeMs / steps;
    // Waypoints after the start point, ending exactly on `to`.
    const waypoints = Array.from({ length: steps }, (_, idx) => {
      const i = idx + 1;
      return {
        x: from.x + (to.x - from.x) * (i / steps),
        y: from.y + (to.y - from.y) * (i / steps),
      };
    });

    if (this.pageType === 'puppeteer') {
      const page = this.underlyingPage as PuppeteerPage;
      await page.mouse.move(from.x, from.y);
      await page.mouse.down();
      for (const point of waypoints) {
        await page.mouse.move(point.x, point.y);
        await new Promise((resolve) => setTimeout(resolve, stepDelay));
      }
      await page.mouse.up();
    } else if (this.pageType === 'playwright') {
      const page = this.underlyingPage as PlaywrightPage;
      await page.mouse.move(from.x, from.y);
      await page.mouse.down();
      for (const point of waypoints) {
        await page.mouse.move(point.x, point.y);
        await page.waitForTimeout(stepDelay);
      }
      await page.mouse.up();
    }
  }

  /**
   * Presses the left mouse button at (x, y), holds it, then releases.
   *
   * Nothing is dispatched when `pageType` is neither 'puppeteer' nor
   * 'playwright'.
   *
   * @param x - horizontal coordinate of the press
   * @param y - vertical coordinate of the press
   * @param duration - hold time in ms; a falsy value means 500, and the
   *   value is clamped to the range 300–600
   */
  async longPress(x: number, y: number, duration?: number) {
    const MIN_PRESS_MS = 300;
    const MAX_PRESS_MS = 600;
    const holdMs = Math.min(
      MAX_PRESS_MS,
      Math.max(MIN_PRESS_MS, duration || 500),
    );
    debugPage(`mouse longPress at ${x}, ${y} for ${holdMs}ms`);

    if (this.pageType === 'puppeteer') {
      const page = this.underlyingPage as PuppeteerPage;
      await page.mouse.move(x, y);
      await page.mouse.down({ button: 'left' });
      await new Promise((res) => setTimeout(res, holdMs));
      await page.mouse.up({ button: 'left' });
    } else if (this.pageType === 'playwright') {
      const page = this.underlyingPage as PlaywrightPage;
      await page.mouse.move(x, y);
      await page.mouse.down({ button: 'left' });
      await page.waitForTimeout(holdMs);
      await page.mouse.up({ button: 'left' });
    }
  }
// packages/web-integration/src/yaml/player.ts
// Branches added to the flow-item dispatch chain in ScriptPlayer.

      } else if (
        'aiLongPress' in (flowItem as MidsceneYamlFlowItemAILongPress)
      ) {
        // The flow item is passed both as locate options and as press params.
        const item = flowItem as MidsceneYamlFlowItemAILongPress;
        await agent.aiLongPress(item.aiLongPress, item, item);
      } else if ('aiSwipe' in (flowItem as MidsceneYamlFlowItemAISwipe)) {
        // The flow item is passed both as swipe params and as locate options.
        const item = flowItem as MidsceneYamlFlowItemAISwipe;
        await agent.aiSwipe(item, item.locate, item);
      }

// packages/web-integration/tests/ai/web/puppeteer/e2e.test.ts
// Cases added to the puppeteer e2e suite.

    // Smoke test. The prompt (Chinese) asks to swipe the central area of the
    // first screen to the right by one screen.
    it('swipe', async () => {
      const launched = await launchPage(
        'https://m.baidu.com/s?word=%E5%A4%A7%E4%BC%97%E8%BD%A6%E5%9E%8Bid4',
        {
          viewport: {
            width: 393,
            height: 808,
          },
        },
      );
      resetFn = launched.reset;
      const agent = new PuppeteerAgent(launched.originPage);
      await agent.aiAction('第一屏幕的中心区域向右滑动一屏');
    });

    // Smoke test. The prompt (Chinese) asks to long-press the
    // "enter new space" button.
    it('longPress', async () => {
      const launched = await launchPage(
        'https://m.baidu.com/s?sid=505&word=6003828251-510eacba84f6ecf6d107bd82a763cd9a',
        {
          viewport: {
            width: 393,
            height: 808,
          },
        },
      );
      resetFn = launched.reset;
      const agent = new PuppeteerAgent(launched.originPage);
      await agent.aiAction('长按进入新空间按钮');
    });