diff --git a/apps/site/docs/en/index.mdx b/apps/site/docs/en/index.mdx index b968d662f..cc79d6745 100644 --- a/apps/site/docs/en/index.mdx +++ b/apps/site/docs/en/index.mdx @@ -108,7 +108,7 @@ There are so many UI automation tools out there, and each one seems to be all-po We would like to thank the following projects: -- [Rsbuild](https://github.com/web-infra-dev/rsbuild) for the build tool. +- [Rsbuild](https://github.com/web-infra-dev/rsbuild) and [Rslib](https://github.com/web-infra-dev/rslib) for the build tool. - [UI-TARS](https://github.com/bytedance/ui-tars) for the open-source agent model UI-TARS. - [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL) for the open-source VL model Qwen2.5-VL. - [scrcpy](https://github.com/Genymobile/scrcpy) and [yume-chan](https://github.com/yume-chan) allow us to control Android devices with browser. diff --git a/apps/site/docs/zh/index.mdx b/apps/site/docs/zh/index.mdx index af898fe86..e88ec84e4 100644 --- a/apps/site/docs/zh/index.mdx +++ b/apps/site/docs/zh/index.mdx @@ -105,7 +105,7 @@ for (const record of recordList) { 我们要感谢以下项目: -- [Rsbuild](https://github.com/web-infra-dev/rsbuild) 提供构建工具。 +- [Rsbuild](https://github.com/web-infra-dev/rsbuild) 和 [Rslib](https://github.com/web-infra-dev/rslib) 提供构建工具。 - [UI-TARS](https://github.com/bytedance/ui-tars) 提供开源智能体模型 UI-TARS。 - [Qwen2.5-VL](https://github.com/QwenLM/Qwen2.5-VL) 提供开源 VL 模型 Qwen2.5-VL。 - [scrcpy](https://github.com/Genymobile/scrcpy) 和 [yume-chan](https://github.com/yume-chan) 让我们能够用浏览器控制 Android 设备。 diff --git a/packages/android/src/page/index.ts b/packages/android/src/page/index.ts index 25b9258c9..f8dc8f562 100644 --- a/packages/android/src/page/index.ts +++ b/packages/android/src/page/index.ts @@ -4,7 +4,7 @@ import fs from 'node:fs'; import { createRequire } from 'node:module'; import path from 'node:path'; import { type Point, type Size, getAIConfig } from '@midscene/core'; -import type { DeviceAction, PageType } from '@midscene/core'; +import type { DeviceAction, ExecutorContext, PageType } from '@midscene/core'; import { getTmpFile, sleep } from '@midscene/core/utils'; import { MIDSCENE_ADB_PATH, @@ -16,11 +16,9 @@ import type { ElementInfo } from '@midscene/shared/extractor'; import { isValidPNGImageBuffer, resizeImg } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { repeat } from '@midscene/shared/utils'; -import { - type AndroidDeviceInputOpt, - type AndroidDevicePage, - commonWebActions, -} from '@midscene/web'; +import type { AndroidDeviceInputOpt, AndroidDevicePage } from '@midscene/web'; +import { commonWebActionsForWebPage } from '@midscene/web/utils'; + import { ADB } from 'appium-adb'; // only for Android, because it's impossible to scroll to the bottom, so we need to set a default scroll times @@ -36,51 +34,6 @@ export type AndroidDeviceOpt = { imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'; } & AndroidDeviceInputOpt; -const asyncNoop = async () => {}; -const androidActions: DeviceAction[] = [ - { - name: 'AndroidBackButton', - description: 'Trigger the system "back" operation on Android devices', - location: false, - call: asyncNoop, - }, - { - name: 'AndroidHomeButton', - description: 'Trigger the system "home" operation on Android devices', - location: false, - call: asyncNoop, - }, - { - name: 'AndroidRecentAppsButton', - description: - 'Trigger the system "recent apps" operation on Android devices', - location: false, - call: asyncNoop, - }, - { - name: 'AndroidLongPress', - description: - 'Trigger a long press on the screen at specified coordinates on Android devices', - paramSchema: '{ duration?: number }', - paramDescription: 'The duration of the long press', - location: 'optional', - whatToLocate: 'The element to be long pressed', - call: asyncNoop, - }, - { - name: 'AndroidPull', - description: - 'Trigger pull down to refresh or pull up actions on Android devices', - paramSchema: - '{ direction: "up" | "down", distance?: number, duration?: number }', - paramDescription: - 'The direction to pull, the distance to pull, and the duration of the pull.', - location: 'optional', - whatToLocate: 'The element to be pulled', - call: asyncNoop, - }, -]; - export class AndroidDevice implements AndroidDevicePage { private deviceId: string; private yadbPushed = false; @@ -93,7 +46,105 @@ export class AndroidDevice implements AndroidDevicePage { options?: AndroidDeviceOpt; actionSpace(): DeviceAction[] { - return commonWebActions.concat(androidActions); + const commonActions = commonWebActionsForWebPage(this); + commonActions.forEach((action) => { + if (action.name === 'Input') { + action.call = async (context, param) => { + const { element } = context; + if (element) { + await this.clearInput(element as unknown as ElementInfo); + + if (!param || !param.value) { + return; + } + } + + await this.keyboard.type(param.value, { + autoDismissKeyboard: this.options?.autoDismissKeyboard, + }); + }; + } + }); + + const allActions: DeviceAction[] = [ + ...commonWebActionsForWebPage(this), + { + name: 'AndroidBackButton', + description: 'Trigger the system "back" operation on Android devices', + location: false, + call: async (context, param) => { + await this.back(); + }, + }, + { + name: 'AndroidHomeButton', + description: 'Trigger the system "home" operation on Android devices', + location: false, + call: async (context, param) => { + await this.home(); + }, + }, + { + name: 'AndroidRecentAppsButton', + description: + 'Trigger the system "recent apps" operation on Android devices', + location: false, + call: async (context, param) => { + await this.recentApps(); + }, + }, + { + name: 'AndroidLongPress', + description: + 'Trigger a long press on the screen at specified coordinates on Android devices', + paramSchema: '{ duration?: number }', + paramDescription: 'The duration of the long press in milliseconds', + location: 'required', + whatToLocate: 'The element to be long pressed', + call: async (context, param) => { + const { element } = context; + if (!element) { + throw new Error( + 'AndroidLongPress requires an element to be located', + ); + } + const [x, y] = element.center; + await this.longPress(x, y, param?.duration); + }, + } as DeviceAction<{ duration?: number }>, + { + name: 'AndroidPull', + description: + 'Trigger pull down to refresh or pull up actions on Android devices', + paramSchema: + '{ direction: "up" | "down", distance?: number, duration?: number }', + paramDescription: + 'The direction to pull, the distance to pull (in pixels), and the duration of the pull (in milliseconds).', + location: 'optional', + whatToLocate: 'The element to be pulled', + call: async (context, param) => { + const { element } = context; + const startPoint = element + ? { left: element.center[0], top: element.center[1] } + : undefined; + if (!param || !param.direction) { + throw new Error('AndroidPull requires a direction parameter'); + } + if (param.direction === 'down') { + await this.pullDown(startPoint, param.distance, param.duration); + } else if (param.direction === 'up') { + await this.pullUp(startPoint, param.distance, param.duration); + } else { + throw new Error(`Unknown pull direction: ${param.direction}`); + } + }, + } as DeviceAction<{ + direction: 'up' | 'down'; + distance?: number; + duration?: number; + }>, + ]; + return allActions; } constructor(deviceId: string, options?: AndroidDeviceOpt) { @@ -472,11 +523,14 @@ ${Object.keys(size) get mouse() { return { click: (x: number, y: number) => this.mouseClick(x, y), - wheel: (deltaX: number, deltaY: number) => - this.mouseWheel(deltaX, deltaY), + wheel: (deltaX: number, deltaY: number, duration?: number) => + this.mouseWheel(deltaX, deltaY, duration), move: (x: number, y: number) => this.mouseMove(x, y), - drag: (from: { x: number; y: number }, to: { x: number; y: number }) => - this.mouseDrag(from, to), + drag: ( + from: { x: number; y: number }, + to: { x: number; y: number }, + duration?: number, + ) => this.mouseDrag(from, to, duration), }; } @@ -532,59 +586,74 @@ ${Object.keys(size) async scrollUntilTop(startPoint?: Point): Promise { if (startPoint) { + const { height } = await this.size(); const start = { x: startPoint.left, y: startPoint.top }; - const end = { x: start.x, y: 0 }; + const end = { x: start.x, y: height }; - await this.mouseDrag(start, end); + await repeat(defaultScrollUntilTimes, () => + this.mouseDrag(start, end, defaultFastScrollDuration), + ); + await sleep(1000); return; } await repeat(defaultScrollUntilTimes, () => - this.mouseWheel(0, 9999999, defaultFastScrollDuration), + this.mouseWheel(0, -9999999, defaultFastScrollDuration), ); await sleep(1000); } async scrollUntilBottom(startPoint?: Point): Promise { if (startPoint) { - const { height } = await this.size(); const start = { x: startPoint.left, y: startPoint.top }; - const end = { x: start.x, y: height }; - await this.mouseDrag(start, end); + const end = { x: start.x, y: 0 }; + + await repeat(defaultScrollUntilTimes, () => + this.mouseDrag(start, end, defaultFastScrollDuration), + ); + await sleep(1000); return; } await repeat(defaultScrollUntilTimes, () => - this.mouseWheel(0, -9999999, defaultFastScrollDuration), + this.mouseWheel(0, 9999999, defaultFastScrollDuration), ); await sleep(1000); } async scrollUntilLeft(startPoint?: Point): Promise { if (startPoint) { + const { width } = await this.size(); const start = { x: startPoint.left, y: startPoint.top }; - const end = { x: 0, y: start.y }; - await this.mouseDrag(start, end); + const end = { x: width, y: start.y }; + + await repeat(defaultScrollUntilTimes, () => + this.mouseDrag(start, end, defaultFastScrollDuration), + ); + await sleep(1000); return; } await repeat(defaultScrollUntilTimes, () => - this.mouseWheel(9999999, 0, defaultFastScrollDuration), + this.mouseWheel(-9999999, 0, defaultFastScrollDuration), ); await sleep(1000); } async scrollUntilRight(startPoint?: Point): Promise { if (startPoint) { - const { width } = await this.size(); const start = { x: startPoint.left, y: startPoint.top }; - const end = { x: width, y: start.y }; - await this.mouseDrag(start, end); + const end = { x: 0, y: start.y }; + + await repeat(defaultScrollUntilTimes, () => + this.mouseDrag(start, end, defaultFastScrollDuration), + ); + await sleep(1000); return; } await repeat(defaultScrollUntilTimes, () => - this.mouseWheel(-9999999, 0, defaultFastScrollDuration), + this.mouseWheel(9999999, 0, defaultFastScrollDuration), ); await sleep(1000); } @@ -595,13 +664,13 @@ ${Object.keys(size) if (startPoint) { const start = { x: startPoint.left, y: startPoint.top }; - const endY = Math.max(0, start.y - scrollDistance); + const endY = Math.min(height, start.y + scrollDistance); const end = { x: start.x, y: endY }; await this.mouseDrag(start, end); return; } - await this.mouseWheel(0, scrollDistance); + await this.mouseWheel(0, -scrollDistance); } async scrollDown(distance?: number, startPoint?: Point): Promise { @@ -610,13 +679,13 @@ ${Object.keys(size) if (startPoint) { const start = { x: startPoint.left, y: startPoint.top }; - const endY = Math.min(height, start.y + scrollDistance); + const endY = Math.max(0, start.y - scrollDistance); const end = { x: start.x, y: endY }; await this.mouseDrag(start, end); return; } - await this.mouseWheel(0, -scrollDistance); + await this.mouseWheel(0, scrollDistance); } async scrollLeft(distance?: number, startPoint?: Point): Promise { @@ -625,13 +694,13 @@ ${Object.keys(size) if (startPoint) { const start = { x: startPoint.left, y: startPoint.top }; - const endX = Math.max(0, start.x - scrollDistance); + const endX = Math.min(width, start.x + scrollDistance); const end = { x: endX, y: start.y }; await this.mouseDrag(start, end); return; } - await this.mouseWheel(scrollDistance, 0); + await this.mouseWheel(-scrollDistance, 0); } async scrollRight(distance?: number, startPoint?: Point): Promise { @@ -640,13 +709,13 @@ ${Object.keys(size) if (startPoint) { const start = { x: startPoint.left, y: startPoint.top }; - const endX = Math.min(width, start.x + scrollDistance); + const endX = Math.max(0, start.x - scrollDistance); const end = { x: endX, y: start.y }; await this.mouseDrag(start, end); return; } - await this.mouseWheel(-scrollDistance, 0); + await this.mouseWheel(scrollDistance, 0); } private async ensureYadb() { @@ -757,6 +826,7 @@ ${Object.keys(size) private async mouseDrag( from: { x: number; y: number }, to: { x: number; y: number }, + duration?: number, ): Promise { const adb = await this.getAdb(); @@ -764,13 +834,18 @@ ${Object.keys(size) const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y); const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y); - await adb.shell(`input swipe ${fromX} ${fromY} ${toX} ${toY} 300`); + // Ensure duration has a default value + const swipeDuration = duration ?? 300; + + await adb.shell( + `input swipe ${fromX} ${fromY} ${toX} ${toY} ${swipeDuration}`, + ); } private async mouseWheel( deltaX: number, deltaY: number, - duration = defaultNormalScrollDuration, + duration?: number, ): Promise { const { width, height } = await this.size(); @@ -792,8 +867,11 @@ ${Object.keys(size) deltaY = Math.max(-maxNegativeDeltaY, Math.min(deltaY, maxPositiveDeltaY)); // Calculate the end coordinates - const endX = startX + deltaX; - const endY = startY + deltaY; + // Note: For swipe, we need to reverse the delta direction + // because positive deltaY should scroll up (show top content), + // which requires swiping from bottom to top (decreasing Y) + const endX = startX - deltaX; + const endY = startY - deltaY; // Adjust coordinates to fit device ratio const { x: adjustedStartX, y: adjustedStartY } = this.adjustCoordinates( @@ -807,9 +885,12 @@ ${Object.keys(size) const adb = await this.getAdb(); + // Ensure duration has a default value + const swipeDuration = duration ?? defaultNormalScrollDuration; + // Execute the swipe operation await adb.shell( - `input swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${duration}`, + `input swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${swipeDuration}`, ); } diff --git a/packages/android/tests/ai/setting.test.ts b/packages/android/tests/ai/setting.test.ts index e8a76d8f6..45e370858 100644 --- a/packages/android/tests/ai/setting.test.ts +++ b/packages/android/tests/ai/setting.test.ts @@ -16,13 +16,20 @@ describe( }); await agent.launch('com.android.settings/.Settings'); - + await agent.aiAction('pull down to refresh'); + await agent.aiAction('long press chat list first chat'); + await agent.aiAction('click recent apps button'); + await agent.aiAction('click android home button'); await agent.aiAction('scroll list to bottom'); await agent.aiAction('open "More settings"'); - await agent.aiAction('scroll list to bottom'); + await agent.aiAction('scroll left until left edge'); + await agent.aiAction('scroll right until right edge'); await agent.aiAction('scroll list to top'); - await agent.aiAction('swipe down one screen'); - await agent.aiAction('swipe up one screen'); + await agent.aiAction('scroll list to bottom'); + await agent.aiAction('scroll down one screen'); + await agent.aiAction('scroll up one screen'); + await agent.aiAction('scroll right one screen'); + await agent.aiAction('scroll left one screen'); }); }, 360 * 1000, diff --git a/packages/android/tests/unit-test/page.test.ts b/packages/android/tests/unit-test/page.test.ts index 090034b8f..a1c38e9d6 100644 --- a/packages/android/tests/unit-test/page.test.ts +++ b/packages/android/tests/unit-test/page.test.ts @@ -607,20 +607,20 @@ describe('AndroidDevice', () => { }); }); - it('scrollUp should call mouseWheel with positive Y delta', async () => { + it('scrollUp should call mouseWheel with negative Y delta', async () => { const wheelSpy = vi .spyOn(device as any, 'mouseWheel') .mockResolvedValue(undefined); await device.scrollUp(100); - expect(wheelSpy).toHaveBeenCalledWith(0, 100); + expect(wheelSpy).toHaveBeenCalledWith(0, -100); }); - it('scrollDown should call mouseWheel with negative Y delta', async () => { + it('scrollDown should call mouseWheel with positive Y delta', async () => { const wheelSpy = vi .spyOn(device as any, 'mouseWheel') .mockResolvedValue(undefined); await device.scrollDown(100); - expect(wheelSpy).toHaveBeenCalledWith(0, -100); + expect(wheelSpy).toHaveBeenCalledWith(0, 100); }); }); diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts index c2f4ce11b..5d3cb6d6b 100644 --- a/packages/core/src/ai-model/common.ts +++ b/packages/core/src/ai-model/common.ts @@ -5,9 +5,9 @@ import type { MidsceneYamlFlowItem, PlanningAction, PlanningActionParamInputOrKeyPress, - PlanningActionParamScroll, PlanningActionParamSleep, Rect, + ScrollParam, Size, } from '@/types'; import { assert } from '@midscene/shared/utils'; @@ -356,7 +356,7 @@ export function buildYamlFlowFromPlans( locate, }); } else if (type === 'Scroll') { - const param = plan.param as PlanningActionParamScroll; + const param = plan.param as ScrollParam; flow.push({ aiScroll: null, locate, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 2cd0b5142..b5af29eaa 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -8,11 +8,7 @@ import type { Size, } from '@midscene/shared/types'; import type { ChatCompletionMessageParam } from 'openai/resources/index'; -import type { - DetailedLocateParam, - MidsceneYamlFlowItem, - scrollParam, -} from './yaml'; +import type { DetailedLocateParam, MidsceneYamlFlowItem } from './yaml'; export type { ElementTreeNode, @@ -295,13 +291,12 @@ export interface PlanningAIResponse { export type PlanningActionParamTap = null; export type PlanningActionParamHover = null; export type PlanningActionParamRightClick = null; + export interface PlanningActionParamInputOrKeyPress { value: string; autoDismissKeyboard?: boolean; } -export type PlanningActionParamScroll = scrollParam; - export interface PlanningActionParamAssert { assertion: TUserPrompt; } @@ -318,15 +313,12 @@ export type PlanningActionParamWaitFor = AgentWaitForOpt & { assertion: string; }; -export interface PlanningActionParamAndroidLongPress { - x: number; - y: number; +export interface AndroidLongPressParam { duration?: number; } -export interface PlanningActionParamAndroidPull { +export interface AndroidPullParam { direction: 'up' | 'down'; - startPoint?: { x: number; y: number }; distance?: number; duration?: number; } @@ -623,5 +615,5 @@ export interface DeviceAction { paramDescription?: string; location?: 'required' | 'optional' | false; whatToLocate?: string; // what to locate if location is required or optional - call: (param: ParamType) => Promise | void; + call: (context: ExecutorContext, param: ParamType) => Promise | void; } diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts index 8c3cf8134..0dae79353 100644 --- a/packages/core/src/yaml.ts +++ b/packages/core/src/yaml.ts @@ -1,4 +1,4 @@ -import type { PlanningActionParamScroll, Rect, TUserPrompt } from './types'; +import type { Rect, TUserPrompt } from './types'; import type { BaseElement, UIContext } from './types'; export interface LocateOption { @@ -24,7 +24,7 @@ export interface DetailedLocateParam extends LocateOption { referenceImage?: ReferenceImage; } -export interface scrollParam { +export interface ScrollParam { direction: 'down' | 'up' | 'right' | 'left'; scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft'; distance?: null | number; // distance in px @@ -157,7 +157,7 @@ export interface MidsceneYamlFlowItemAIKeyboardPress extends LocateOption { export interface MidsceneYamlFlowItemAIScroll extends LocateOption, - PlanningActionParamScroll { + ScrollParam { aiScroll: null; locate?: TUserPrompt; // which area to scroll, optional } diff --git a/packages/web-integration/src/bridge-mode/agent-cli-side.ts b/packages/web-integration/src/bridge-mode/agent-cli-side.ts index ae23c10d2..eb616ced9 100644 --- a/packages/web-integration/src/bridge-mode/agent-cli-side.ts +++ b/packages/web-integration/src/bridge-mode/agent-cli-side.ts @@ -1,5 +1,7 @@ import { PageAgent, type PageAgentOpt } from '@/common/agent'; +import { commonWebActionsForWebPage } from '@/common/utils'; import type { KeyboardAction, MouseAction } from '@/page'; +import type { DeviceAction, ExecutorContext } from '@midscene/core'; import { assert } from '@midscene/shared/utils'; import { type BridgeConnectTabOptions, @@ -44,7 +46,7 @@ export const getBridgePageInCliSide = ( }, }; - return new Proxy(page, { + const proxyPage = new Proxy(page, { get(target, prop, receiver) { assert(typeof prop === 'string', 'prop must be a string'); @@ -64,6 +66,10 @@ export const getBridgePageInCliSide = ( return undefined; } + if (prop === 'actionSpace') { + return () => commonWebActionsForWebPage(proxyPage); + } + if (Object.keys(page).includes(prop)) { return page[prop as keyof typeof page]; } @@ -101,6 +107,8 @@ export const getBridgePageInCliSide = ( return bridgeCaller(prop); }, }) as ChromeExtensionPageCliSide; + + return proxyPage; }; export class AgentOverChromeBridge extends PageAgent { diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts index ecae90a14..cd037e901 100644 --- a/packages/web-integration/src/chrome-extension/page.ts +++ b/packages/web-integration/src/chrome-extension/page.ts @@ -7,10 +7,12 @@ import type { WebKeyInput } from '@/common/page'; import { limitOpenNewTabScript } from '@/common/ui-utils'; -import { type AbstractPage, type MouseButton, commonWebActions } from '@/page'; +import { commonWebActionsForWebPage } from '@/common/utils'; +import type { AbstractPage, MouseButton } from '@/page'; import type { DeviceAction, ElementTreeNode, + ExecutorContext, Point, Size, } from '@midscene/core'; @@ -56,7 +58,7 @@ export default class ChromeExtensionProxyPage implements AbstractPage { } actionSpace(): DeviceAction[] { - return commonWebActions; + return commonWebActionsForWebPage(this); } public async setActiveTabId(tabId: number) { diff --git a/packages/web-integration/src/common/agent.ts b/packages/web-integration/src/common/agent.ts index 9fb32bc26..7b6212afa 100644 --- a/packages/web-integration/src/common/agent.ts +++ b/packages/web-integration/src/common/agent.ts @@ -20,8 +20,8 @@ import { type LocatorValidatorOption, type MidsceneYamlScript, type OnTaskStartTip, - type PlanningActionParamScroll, type Rect, + type ScrollParam, type TUserPrompt, } from '@midscene/core'; @@ -411,7 +411,7 @@ export class PageAgent { } async aiScroll( - scrollParam: PlanningActionParamScroll, + scrollParam: ScrollParam, locatePrompt?: TUserPrompt, opt?: LocateOption, ) { diff --git a/packages/web-integration/src/common/plan-builder.ts b/packages/web-integration/src/common/plan-builder.ts index ade992b7d..8ad45a959 100644 --- a/packages/web-integration/src/common/plan-builder.ts +++ b/packages/web-integration/src/common/plan-builder.ts @@ -1,12 +1,11 @@ import type { DetailedLocateParam, - MidsceneYamlFlowItem, PlanningAction, PlanningActionParamInputOrKeyPress, - PlanningActionParamScroll, PlanningActionParamSleep, PlanningActionParamTap, PlanningLocateParam, + ScrollParam, } from '@midscene/core'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; @@ -18,7 +17,7 @@ export function buildPlans( locateParam?: DetailedLocateParam, param?: | PlanningActionParamInputOrKeyPress - | PlanningActionParamScroll + | ScrollParam | PlanningActionParamSleep, ): PlanningAction[] { let returnPlans: PlanningAction[] = []; @@ -65,9 +64,9 @@ export function buildPlans( if (type === 'Scroll') { assert(param, `missing param for action "${type}"`); - const scrollPlan: PlanningAction = { + const scrollPlan: PlanningAction = { type, - param: param as PlanningActionParamScroll, + param: param as ScrollParam, thought: '', locate: locateParam, }; diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts index 555379f97..0afef1835 100644 --- a/packages/web-integration/src/common/tasks.ts +++ b/packages/web-integration/src/common/tasks.ts @@ -3,6 +3,7 @@ import type { PuppeteerWebPage } from '@/puppeteer'; import { type AIUsageInfo, type BaseElement, + type DeviceAction, type DumpSubscriber, type ExecutionRecorderItem, type ExecutionTaskActionApply, @@ -25,15 +26,9 @@ import { type PageType, type PlanningAIResponse, type PlanningAction, - type PlanningActionParamAndroidLongPress, - type PlanningActionParamAndroidPull, type PlanningActionParamAssert, type PlanningActionParamError, - type PlanningActionParamHover, - type PlanningActionParamInputOrKeyPress, - type PlanningActionParamScroll, type PlanningActionParamSleep, - type PlanningActionParamTap, type PlanningActionParamWaitFor, type TMultimodalPrompt, type TUserPrompt, @@ -52,12 +47,11 @@ import { MIDSCENE_REPLANNING_CYCLE_LIMIT, getAIConfigInNumber, } from '@midscene/shared/env'; -import type { ElementInfo } from '@midscene/shared/extractor'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; import type { WebElementInfo, WebUIContext } from '../web-element'; import type { TaskCache } from './task-cache'; -import { getKeyCommands, taskTitleStr } from './ui-utils'; +import { taskTitleStr } from './ui-utils'; import { matchElementFromCache, matchElementFromPlan, @@ -70,13 +64,9 @@ interface ExecutionResult { executor: Executor; } -const debug = getDebug('page-task-executor'); +const debug = getDebug('device-task-executor'); const defaultReplanningCycleLimit = 10; -const isAndroidPage = (page: WebPage): page is AndroidDevicePage => { - return page.pageType === 'android'; -}; - export class PageTaskExecutor { page: WebPage; @@ -213,7 +203,7 @@ export class PageTaskExecutor { }, ) { const tasks: ExecutionTaskApply[] = []; - plans.forEach((plan) => { + for (const plan of plans) { if (plan.type === 'Locate') { if ( plan.locate === null || @@ -221,7 +211,7 @@ export class PageTaskExecutor { plan.locate?.id === 'null' ) { // console.warn('Locate action with id is null, will be ignored'); - return; + continue; } const taskFind: ExecutionTaskInsightLocateApply = { type: 'Insight', @@ -447,74 +437,44 @@ export class PageTaskExecutor { }, }; tasks.push(taskAssert); - } else if (plan.type === 'Input') { - const taskActionInput: ExecutionTaskActionApply = + } else if (plan.type === 'Error') { + const taskActionError: ExecutionTaskActionApply = { type: 'Action', - subType: 'Input', + subType: 'Error', param: plan.param, - thought: plan.thought, + thought: plan.thought || plan.param?.thought, locate: plan.locate, - executor: async (taskParam, { element }) => { - if (element) { - await this.page.clearInput(element as unknown as ElementInfo); - - if (!taskParam || !taskParam.value) { - return; - } - } - - await this.page.keyboard.type(taskParam.value, { - autoDismissKeyboard: taskParam.autoDismissKeyboard, - }); + executor: async () => { + throw new Error( + plan?.thought || plan.param?.thought || 'error without thought', + ); }, }; - tasks.push(taskActionInput); - } else if (plan.type === 'KeyboardPress') { - const taskActionKeyboardPress: ExecutionTaskActionApply = + tasks.push(taskActionError); + } else if (plan.type === 'Finished') { + const taskActionFinished: ExecutionTaskActionApply = { + type: 'Action', + subType: 'Finished', + param: null, + thought: plan.thought, + locate: plan.locate, + executor: async (param) => {}, + }; + tasks.push(taskActionFinished); + } else if (plan.type === 'Sleep') { + const taskActionSleep: ExecutionTaskActionApply = { type: 'Action', - subType: 'KeyboardPress', + subType: 'Sleep', param: plan.param, thought: plan.thought, locate: plan.locate, executor: async (taskParam) => { - const keys = getKeyCommands(taskParam.value); - - await this.page.keyboard.press(keys); - }, - }; - tasks.push(taskActionKeyboardPress); - } else if (plan.type === 'Tap') { - const taskActionTap: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'Tap', - thought: plan.thought, - locate: plan.locate, - executor: async (param, { element }) => { - assert(element, 'Element not found, cannot tap'); - await this.page.mouse.click(element.center[0], element.center[1]); - }, - }; - tasks.push(taskActionTap); - } else if (plan.type === 'RightClick') { - const taskActionRightClick: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'RightClick', - thought: plan.thought, - locate: plan.locate, - executor: async (param, { element }) => { - assert(element, 'Element not found, cannot right click'); - await this.page.mouse.click( - element.center[0], - element.center[1], - { button: 'right' }, - ); + await sleep(taskParam?.timeMs || 3000); }, }; - tasks.push(taskActionRightClick); + tasks.push(taskActionSleep); } else if (plan.type === 'Drag') { const taskActionDrag: ExecutionTaskActionApply<{ start_box: { x: number; y: number }; @@ -534,228 +494,34 @@ export class PageTaskExecutor { }, }; tasks.push(taskActionDrag); - } else if (plan.type === 'Hover') { - const taskActionHover: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'Hover', - thought: plan.thought, - locate: plan.locate, - executor: async (param, { element }) => { - assert(element, 'Element not found, cannot hover'); - await this.page.mouse.move(element.center[0], element.center[1]); - }, - }; - tasks.push(taskActionHover); - } else if (plan.type === 'Scroll') { - const taskActionScroll: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'Scroll', - param: plan.param, - thought: plan.thought, - locate: plan.locate, - executor: async (taskParam, { element }) => { - const startingPoint = element - ? { - left: element.center[0], - top: element.center[1], - } - : undefined; - const scrollToEventName = taskParam?.scrollType; - if (scrollToEventName === 'untilTop') { - await this.page.scrollUntilTop(startingPoint); - } else if (scrollToEventName === 'untilBottom') { - await this.page.scrollUntilBottom(startingPoint); - } else if (scrollToEventName === 'untilRight') { - await this.page.scrollUntilRight(startingPoint); - } else if (scrollToEventName === 'untilLeft') { - await this.page.scrollUntilLeft(startingPoint); - } else if (scrollToEventName === 'once' || !scrollToEventName) { - if ( - taskParam?.direction === 'down' || - !taskParam || - !taskParam.direction - ) { - await this.page.scrollDown( - taskParam?.distance || undefined, - startingPoint, - ); - } else if (taskParam.direction === 'up') { - await this.page.scrollUp( - taskParam.distance || undefined, - startingPoint, - ); - } else if (taskParam.direction === 'left') { - await this.page.scrollLeft( - taskParam.distance || undefined, - startingPoint, - ); - } else if (taskParam.direction === 'right') { - await this.page.scrollRight( - taskParam.distance || undefined, - startingPoint, - ); - } else { - throw new Error( - `Unknown scroll direction: ${taskParam.direction}`, - ); - } - // until mouse event is done - await sleep(500); - } else { - throw new Error( - `Unknown scroll event type: ${scrollToEventName}, taskParam: ${JSON.stringify( - taskParam, - )}`, - ); - } - }, - }; - tasks.push(taskActionScroll); - } else if (plan.type === 'Sleep') { - const taskActionSleep: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'Sleep', - param: plan.param, - thought: plan.thought, - locate: plan.locate, - executor: async (taskParam) => { - await sleep(taskParam?.timeMs || 3000); - }, - }; - tasks.push(taskActionSleep); - } else if (plan.type === 'Error') { - const taskActionError: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'Error', - param: plan.param, - thought: plan.thought || plan.param?.thought, - locate: plan.locate, - executor: async () => { - throw new Error( - plan?.thought || plan.param?.thought || 'error without thought', - ); - }, - }; - tasks.push(taskActionError); - } else if (plan.type === 'Finished') { - const taskActionFinished: ExecutionTaskActionApply = { - type: 'Action', - subType: 'Finished', - param: null, - thought: plan.thought, - locate: plan.locate, - executor: async (param) => {}, - }; - tasks.push(taskActionFinished); - } else if (plan.type === 'AndroidHomeButton') { - const taskActionAndroidHomeButton: ExecutionTaskActionApply = { + } else { + const planType = plan.type; + const task: ExecutionTaskActionApply = { type: 'Action', - subType: 'AndroidHomeButton', - param: null, + subType: planType, thought: plan.thought, - locate: plan.locate, - executor: async (param) => { - // Check if the page has back method (Android devices) - assert( - isAndroidPage(this.page), - 'Cannot use home button on non-Android devices', + param: plan.param, + executor: async (param, context) => { + debug( + 'executing action', + planType, + param, + `context.element.center: ${context.element?.center}`, ); - await this.page.home(); - }, - }; - tasks.push(taskActionAndroidHomeButton); - } else if (plan.type === 'AndroidBackButton') { - const taskActionAndroidBackButton: ExecutionTaskActionApply = { - type: 'Action', - subType: 'AndroidBackButton', - param: null, - thought: plan.thought, - locate: plan.locate, - executor: async (param) => { - assert( - isAndroidPage(this.page), - 'Cannot use back button on non-Android devices', + const actionSpace = await this.page.actionSpace(); + const action = actionSpace.find( + (action) => action.name === planType, ); - await this.page.back(); + if (!action) { + throw new Error(`Action type '${planType}' not found`); + } + const actionFn = action.call.bind(this.page); + return await actionFn(context, param); }, }; - tasks.push(taskActionAndroidBackButton); - } else if (plan.type === 'AndroidRecentAppsButton') { - const taskActionAndroidRecentAppsButton: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'AndroidRecentAppsButton', - param: null, - thought: plan.thought, - locate: plan.locate, - executor: async (param) => { - assert( - isAndroidPage(this.page), - 'Cannot use recent apps button on non-Android devices', - ); - await this.page.recentApps(); - }, - }; - tasks.push(taskActionAndroidRecentAppsButton); - } else if (plan.type === 'AndroidLongPress') { - const taskActionAndroidLongPress: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'AndroidLongPress', - param: plan.param as PlanningActionParamAndroidLongPress, - thought: plan.thought, - locate: plan.locate, - executor: async (param) => { - assert( - isAndroidPage(this.page), - 'Cannot use long press on non-Android devices', - ); - const { x, y, duration } = param; - await this.page.longPress(x, y, duration); - }, - }; - tasks.push(taskActionAndroidLongPress); - } else if (plan.type === 'AndroidPull') { - const taskActionAndroidPull: ExecutionTaskActionApply = - { - type: 'Action', - subType: 'AndroidPull', - param: plan.param as PlanningActionParamAndroidPull, - thought: plan.thought, - locate: plan.locate, - executor: async (param) => { - assert( - isAndroidPage(this.page), - 'Cannot use pull action on non-Android devices', - ); - const { direction, startPoint, distance, duration } = param; - - const convertedStartPoint = startPoint - ? { left: startPoint.x, top: startPoint.y } - : undefined; - - if (direction === 'down') { - await this.page.pullDown( - convertedStartPoint, - distance, - duration, - ); - } else if (direction === 'up') { - await this.page.pullUp(convertedStartPoint, distance, duration); - } else { - throw new Error(`Unknown pull direction: ${direction}`); - } - }, - }; - tasks.push(taskActionAndroidPull); - } else { - throw new Error(`Unknown or unsupported task type: ${plan.type}`); + tasks.push(task); } - }); + } const wrappedTasks = tasks.map( (task: ExecutionTaskApply, index: number) => { @@ -852,7 +618,7 @@ export class PageTaskExecutor { ); const actionSpace = await this.page.actionSpace(); debug( - 'actionSpace for page', + 'actionSpace for page is:', actionSpace.map((action) => action.name).join(', '), ); assert(Array.isArray(actionSpace), 'actionSpace must be an array'); diff --git a/packages/web-integration/src/common/ui-utils.ts b/packages/web-integration/src/common/ui-utils.ts index 5521ecebb..47172dc03 100644 --- a/packages/web-integration/src/common/ui-utils.ts +++ b/packages/web-integration/src/common/ui-utils.ts @@ -1,4 +1,5 @@ import type { + AndroidPullParam, DetailedLocateParam, ExecutionTask, ExecutionTaskAction, @@ -6,8 +7,7 @@ import type { ExecutionTaskInsightLocate, ExecutionTaskInsightQuery, ExecutionTaskPlanning, - PlanningActionParamAndroidPull, - PlanningActionParamScroll, + ScrollParam, } from '@midscene/core'; export function typeStr(task: ExecutionTask) { @@ -53,22 +53,19 @@ export function locateParamStr(locate?: DetailedLocateParam) { : locate.prompt.prompt; } -export function scrollParamStr(scrollParam?: PlanningActionParamScroll) { +export function scrollParamStr(scrollParam?: ScrollParam) { if (!scrollParam) { return ''; } return `${scrollParam.direction || 'down'}, ${scrollParam.scrollType || 'once'}, ${scrollParam.distance || 'distance-not-set'}`; } -export function pullParamStr(pullParam?: PlanningActionParamAndroidPull) { +export function pullParamStr(pullParam?: AndroidPullParam) { if (!pullParam) { return ''; } const parts: string[] = []; parts.push(`direction: ${pullParam.direction || 'down'}`); - if (pullParam.startPoint) { - parts.push(`start: (${pullParam.startPoint.x}, ${pullParam.startPoint.y})`); - } if (pullParam.distance) { parts.push(`distance: ${pullParam.distance}`); } diff --git a/packages/web-integration/src/common/utils.ts b/packages/web-integration/src/common/utils.ts index f0310756b..a005e36ab 100644 --- a/packages/web-integration/src/common/utils.ts +++ b/packages/web-integration/src/common/utils.ts @@ -1,17 +1,20 @@ import type { StaticPage } from '@/playground'; import type { BaseElement, + DeviceAction, ElementTreeNode, ExecutionDump, ExecutionTask, + ExecutorContext, PlanningLocateParam, PlaywrightParserOpt, + ScrollParam, TMultimodalPrompt, TUserPrompt, UIContext, } from '@midscene/core'; import { elementByPositionWithElementInfo } from '@midscene/core/ai-model'; -import { uploadTestInfoToServer } from '@midscene/core/utils'; +import { sleep, uploadTestInfoToServer } from '@midscene/core/utils'; import { MIDSCENE_REPORT_TAG_NAME, getAIConfig } from '@midscene/shared/env'; import type { ElementInfo } from '@midscene/shared/extractor'; import { @@ -25,10 +28,12 @@ import { assert, logMsg, uuid } from '@midscene/shared/utils'; import dayjs from 'dayjs'; import type { Page as PlaywrightPage } from 'playwright'; import type { Page as PuppeteerPage } from 'puppeteer'; +import type { AbstractPage } from '../page'; import { WebElementInfo, type WebUIContext } from '../web-element'; import type { WebPage } from './page'; import { debug as cacheDebug } from './task-cache'; import type { PageTaskExecutor } from './tasks'; +import { getKeyCommands } from './ui-utils'; const debug = getDebug('tool:profile'); @@ -343,3 +348,124 @@ export const parsePrompt = ( : undefined, }; }; + +export const commonWebActionsForWebPage = ( + page: T, +): DeviceAction[] => [ + { + name: 'Tap', + description: 'Tap the element', + location: 'required', + call: async (context) => { + const { element } = context; + assert(element, 'Element not found, cannot tap'); + await page.mouse.click(element.center[0], element.center[1], { + button: 'left', + }); + }, + }, + { + name: 'RightClick', + description: 'Right click the element', + location: 'required', + call: async (context) => { + const { element } = context; + assert(element, 'Element not found, cannot right click'); + await page.mouse.click(element.center[0], element.center[1], { + button: 'right', + }); + }, + }, + { + name: 'Hover', + description: 'Move the mouse to the element', + location: 'required', + call: async (context) => { + const { element } = context; + assert(element, 'Element not found, cannot hover'); + await page.mouse.move(element.center[0], element.center[1]); + }, + }, + { + name: 'Input', + description: 'Replace the input field with a new value', + paramSchema: '{ value: string }', + paramDescription: + '`value` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.', + location: 'required', + whatToLocate: 'The input field to be filled', + call: async (context, param) => { + const { element } = context; + if (element) { + await page.clearInput(element as unknown as ElementInfo); + + if (!param || !param.value) { + return; + } + } + + // Note: there is another implementation in AndroidDevicePage, which is more complex + await page.keyboard.type(param.value); + }, + } as DeviceAction<{ value: string }>, + { + name: 'KeyboardPress', + description: 'Press a key', + paramSchema: '{ value: string }', + paramDescription: 'The key to be pressed', + location: false, + call: async (context, param) => { + const keys = getKeyCommands(param.value); + await page.keyboard.press(keys as any); // TODO: fix this type error + }, + } as DeviceAction<{ value: string }>, + { + name: 'Scroll', + description: 'Scroll the page or an element', + paramSchema: + '{ direction: "down"(default) | "up" | "right" | "left", scrollType: "once" (default) | "untilBottom" | "untilTop" | "untilRight" | "untilLeft", distance: number | null }', + paramDescription: + 'The direction to scroll, the scroll type, and the distance to scroll. The distance is the number of pixels to scroll. If not specified, use `down` direction, `once` scroll type, and `null` distance.', + location: 'optional', + whatToLocate: 'The element to be scrolled', + call: async (context, param) => { + const { element } = context; + const startingPoint = element + ? { + left: element.center[0], + top: element.center[1], + } + : undefined; + const scrollToEventName = param?.scrollType; + if (scrollToEventName === 'untilTop') { + await page.scrollUntilTop(startingPoint); + } else if (scrollToEventName === 'untilBottom') { + await page.scrollUntilBottom(startingPoint); + } else if (scrollToEventName === 'untilRight') { + await page.scrollUntilRight(startingPoint); + } else if (scrollToEventName === 'untilLeft') { + await page.scrollUntilLeft(startingPoint); + } else if (scrollToEventName === 'once' || !scrollToEventName) { + if (param?.direction === 'down' || !param || !param.direction) { + await page.scrollDown(param?.distance || undefined, startingPoint); + } else if (param.direction === 'up') { + await page.scrollUp(param.distance || undefined, startingPoint); + } else if (param.direction === 'left') { + await page.scrollLeft(param.distance || undefined, startingPoint); + } else if (param.direction === 'right') { + await page.scrollRight(param.distance || undefined, startingPoint); + } else { + throw new Error(`Unknown scroll direction: ${param.direction}`); + } + // until mouse event is done + await sleep(500); + } else { + throw new Error( + `Unknown scroll event type: ${scrollToEventName}, param: ${JSON.stringify( + param, + )}`, + ); + } + }, + } as DeviceAction, +]; diff --git a/packages/web-integration/src/index.ts b/packages/web-integration/src/index.ts index baddd8cca..94edd906d 100644 --- a/packages/web-integration/src/index.ts +++ b/packages/web-integration/src/index.ts @@ -6,7 +6,6 @@ export type { AndroidDeviceInputOpt, } from './common/page'; export type { AbstractPage } from './page'; -export { commonWebActions } from './page'; export type { WebUIContext } from './web-element'; export { PageAgent, type PageAgentOpt } from './common/agent'; diff --git a/packages/web-integration/src/page.ts b/packages/web-integration/src/page.ts index f23c4da1c..3bdcd1539 100644 --- a/packages/web-integration/src/page.ts +++ b/packages/web-integration/src/page.ts @@ -1,6 +1,15 @@ -import type { DeviceAction, Point, Size } from '@midscene/core'; +import type { + DeviceAction, + ExecutorContext, + Point, + ScrollParam, + Size, +} from '@midscene/core'; +import { sleep } from '@midscene/core/utils'; import type { ElementInfo, ElementNode } from '@midscene/shared/extractor'; +import { assert } from '@midscene/shared/utils'; import type { WebKeyInput } from './common/page'; +import { getKeyCommands } from './common/ui-utils'; import type { WebUIContext } from './web-element'; export type MouseButton = 'left' | 'right' | 'middle'; @@ -40,7 +49,7 @@ export abstract class AbstractPage { abstract url(): string | Promise; abstract screenshotBase64?(): Promise; abstract size(): Promise; - abstract actionSpace(): DeviceAction[]; + abstract actionSpace(): DeviceAction[] | Promise; get mouse(): MouseAction { return { @@ -78,7 +87,7 @@ export abstract class AbstractPage { abstract scrollUp(distance?: number, startingPoint?: Point): Promise; abstract scrollDown(distance?: number, startingPoint?: Point): Promise; abstract scrollLeft(distance?: number, startingPoint?: Point): Promise; - abstract scrollRight(distance?: number): Promise; + abstract scrollRight(distance?: number, startingPoint?: Point): Promise; abstract _forceUsePageContext?(): Promise; @@ -91,62 +100,3 @@ export abstract class AbstractPage { abstract evaluateJavaScript?(script: string): Promise; } - -const asyncNoop = async () => {}; -export const commonWebActions: DeviceAction[] = [ - { - name: 'Tap', - description: 'Tap the element', - location: 'required', - call: asyncNoop, - }, - { - name: 'RightClick', - description: 'Right click the element', - location: 'required', - call: asyncNoop, - }, - { - name: 'Hover', - description: 'Move the mouse to the element', - location: 'required', - call: asyncNoop, - }, - { - name: 'Input', - description: 'Replace the input field with a new value', - paramSchema: '{ value: string }', - paramDescription: - '`value` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.', - location: 'required', - whatToLocate: 'The input field to be filled', - call: asyncNoop, - }, - { - name: 'KeyboardPress', - description: 'Press a key', - paramSchema: '{ value: string }', - paramDescription: 'The key to be pressed', - location: false, - call: asyncNoop, - }, - { - name: 'Scroll', - description: 'Scroll the page or an element', - paramSchema: - '{ direction: "down"(default) | "up" | "right" | "left", scrollType: "once" (default) | "untilBottom" | "untilTop" | "untilRight" | "untilLeft", distance: number | null }', - paramDescription: - 'The direction to scroll, the scroll type, and the distance to scroll. The distance is the number of pixels to scroll. If not specified, use `down` direction, `once` scroll type, and `null` distance.', - location: 'optional', - whatToLocate: 'The element to be scrolled', - call: asyncNoop, - }, - { - name: 'Sleep', - description: 'Sleep for a period of time', - paramSchema: '{ timeMs: number }', - paramDescription: 'The duration of the sleep in milliseconds', - location: false, - call: asyncNoop, - }, -]; diff --git a/packages/web-integration/src/playground/static-page.ts b/packages/web-integration/src/playground/static-page.ts index acdc07c74..9fca07b68 100644 --- a/packages/web-integration/src/playground/static-page.ts +++ b/packages/web-integration/src/playground/static-page.ts @@ -1,6 +1,6 @@ import { ERROR_CODE_NOT_IMPLEMENTED_AS_DESIGNED } from '@/common/utils'; import type { AbstractPage } from '@/page'; -import type { DeviceAction, Point } from '@midscene/core'; +import type { DeviceAction, ExecutorContext, Point } from '@midscene/core'; import type { WebUIContext } from '../web-element'; const ThrowNotImplemented: any = (methodName: string) => { diff --git a/packages/web-integration/src/puppeteer/base-page.ts b/packages/web-integration/src/puppeteer/base-page.ts index eed50e2bf..ac0fa3c6a 100644 --- a/packages/web-integration/src/puppeteer/base-page.ts +++ b/packages/web-integration/src/puppeteer/base-page.ts @@ -1,6 +1,7 @@ import type { DeviceAction, ElementTreeNode, + ExecutorContext, Point, Size, } from '@midscene/core'; @@ -17,8 +18,8 @@ import { assert } from '@midscene/shared/utils'; import type { Page as PlaywrightPage } from 'playwright'; import type { Page as PuppeteerPage } from 'puppeteer'; import type { WebKeyInput } from '../common/page'; -import { type AbstractPage, commonWebActions } from '../page'; -import type { MouseButton } from '../page'; +import { commonWebActionsForWebPage } from '../common/utils'; +import type { AbstractPage, MouseButton } from '../page'; export const debugPage = getDebug('web:page'); @@ -34,7 +35,7 @@ export class Page< pageType: AgentType; actionSpace(): DeviceAction[] { - return commonWebActions; + return commonWebActionsForWebPage(this); } private async evaluate( diff --git a/packages/web-integration/tests/ai/bridge/agent.test.ts b/packages/web-integration/tests/ai/bridge/agent.test.ts index eefc41d26..2a2b63f01 100644 --- a/packages/web-integration/tests/ai/bridge/agent.test.ts +++ b/packages/web-integration/tests/ai/bridge/agent.test.ts @@ -8,12 +8,11 @@ vi.setConfig({ testTimeout: 60 * 1000, }); const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); -const describeIf = process.env.BRIDGE_MODE ? describe : describe.skip; -describeIf( +describe.skipIf(!process.env.BRIDGE_MODE)( 'fully functional agent in server(cli) side', { - timeout: 3 * 60 * 10, + timeout: 3 * 60 * 1000, }, () => { it('basic', async () => { diff --git a/packages/web-integration/tests/ai/bridge/open-new-tab.test.ts b/packages/web-integration/tests/ai/bridge/open-new-tab.test.ts index 0f867fdf0..11d8e8b10 100644 --- a/packages/web-integration/tests/ai/bridge/open-new-tab.test.ts +++ b/packages/web-integration/tests/ai/bridge/open-new-tab.test.ts @@ -1,9 +1,6 @@ -import { - AgentOverChromeBridge, - getBridgePageInCliSide, -} from '@/bridge-mode/agent-cli-side'; +import { AgentOverChromeBridge } from '@/bridge-mode/agent-cli-side'; import { sleep } from '@midscene/core/utils'; -import { describe, expect, it, test, vi } from 'vitest'; +import { describe, it, vi } from 'vitest'; vi.setConfig({ testTimeout: 300 * 1000, diff --git a/packages/web-integration/tests/unit-test/page-task-executor-rightclick.test.ts b/packages/web-integration/tests/unit-test/page-task-executor-rightclick.test.ts index 0557002cb..ae45b3862 100644 --- a/packages/web-integration/tests/unit-test/page-task-executor-rightclick.test.ts +++ b/packages/web-integration/tests/unit-test/page-task-executor-rightclick.test.ts @@ -1,5 +1,5 @@ import { PageTaskExecutor } from '@/common/tasks'; -import type { PlanningAction } from '@midscene/core'; +import type { DeviceAction, PlanningAction } from '@midscene/core'; import { beforeEach, describe, expect, it, vi } from 'vitest'; // Mock page with mouse operations @@ -10,6 +10,22 @@ const mockPage = { }, screenshotBase64: vi.fn().mockResolvedValue('mock-screenshot'), evaluateJavaScript: vi.fn(), + actionSpace: () => + [ + { + name: 'RightClick', + call: (context, param) => { + if (!context.element) { + throw new Error('Element not found'); + } + mockPage.mouse.click( + context.element.center[0], + context.element.center[1], + { button: 'right' }, + ); + }, + }, + ] as DeviceAction[], } as any; // Mock insight @@ -165,8 +181,6 @@ describe('PageTaskExecutor RightClick Action', () => { }; // Should throw error when element is null - await expect(rightClickTask.executor(null, mockContext)).rejects.toThrow( - 'Element not found, cannot right click', - ); + await expect(rightClickTask.executor(null, mockContext)).rejects.toThrow(); }); });