diff --git a/.husky/commit-msg b/.husky/commit-msg old mode 100644 new mode 100755 index e9982a6d9..b054e965d --- a/.husky/commit-msg +++ b/.husky/commit-msg @@ -1 +1,6 @@ -npx --no -- commitlint --edit "$1" \ No newline at end of file +#!/bin/sh + +# Ensure node is in PATH (for fnm users on macOS; harmless elsewhere because the directory simply does not exist) +export PATH="$HOME/Library/Application Support/fnm/aliases/default/bin:$PATH" + +npx --no -- commitlint --edit "$1" diff --git a/apps/site/docs/en/integrate-with-android.mdx b/apps/site/docs/en/integrate-with-android.mdx index 80472a79c..589eb4a76 100644 --- a/apps/site/docs/en/integrate-with-android.mdx +++ b/apps/site/docs/en/integrate-with-android.mdx @@ -128,6 +128,7 @@ The AndroidDevice constructor supports the following parameters: - `remoteAdbPort?: number` - Optional, the remote adb port. - `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - Optional, when should Midscene invoke [yadb](https://github.com/ysbing/YADB) to input texts. (Default: 'always-yadb') - `displayId?: number` - Optional, the display id to use. (Default: undefined, means use the current display) + - `screenshotResizeScale?: number` - Optional, the screenshot resize scale. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI model and can improve response speed. Default is `1 / devicePixelRatio` (automatically scaled based on the device pixel ratio). ### Additional Android Agent Interfaces diff --git a/apps/site/docs/en/integrate-with-ios.mdx b/apps/site/docs/en/integrate-with-ios.mdx index a704e2c97..0cd5d5e38 100644 --- a/apps/site/docs/en/integrate-with-ios.mdx +++ b/apps/site/docs/en/integrate-with-ios.mdx @@ -151,6 +151,7 @@ The IOSDevice constructor supports the following parameters: - `wdaPort?: number` - Optional, WebDriverAgent port. Default is 8100. - `wdaHost?: string` - Optional, WebDriverAgent host. Default is 'localhost'. - `autoDismissKeyboard?: boolean` - Optional, whether to automatically dismiss keyboard after text input. 
Default is true. + - `screenshotResizeScale?: number` - Optional, the screenshot resize scale. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI model and can improve response speed. Default is 1 (no scaling). - `customActions?: DeviceAction[]` - Optional, list of custom device actions. ### Additional iOS Agent Interfaces diff --git a/apps/site/docs/zh/integrate-with-android.mdx b/apps/site/docs/zh/integrate-with-android.mdx index 2800c75ca..4b94bd697 100644 --- a/apps/site/docs/zh/integrate-with-android.mdx +++ b/apps/site/docs/zh/integrate-with-android.mdx @@ -127,6 +127,7 @@ AndroidDevice 的构造函数支持以下参数: - `remoteAdbPort?: number` - 可选参数,用于指定远程 adb 端口。 - `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - 可选参数,控制 Midscene 何时调用 [yadb](https://github.com/ysbing/YADB) 来输入文本。默认值为 'always-yadb'。 - `displayId?: number` - 可选参数,用于指定要使用的显示器 ID。默认值为 undefined,表示使用当前显示器。 + - `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 `1 / devicePixelRatio`(根据设备像素比自动缩放)。 ### Android Agent 上的更多接口 diff --git a/apps/site/docs/zh/integrate-with-ios.mdx b/apps/site/docs/zh/integrate-with-ios.mdx index 25cdae85f..9c3f04ad7 100644 --- a/apps/site/docs/zh/integrate-with-ios.mdx +++ b/apps/site/docs/zh/integrate-with-ios.mdx @@ -199,6 +199,7 @@ IOSDevice 的构造函数支持以下参数: - `wdaPort?: number` - 可选参数,WebDriverAgent 端口。默认值为 8100。 - `wdaHost?: string` - 可选参数,WebDriverAgent 主机。默认值为 'localhost'。 - `autoDismissKeyboard?: boolean` - 可选参数,是否在输入文本后自动关闭键盘。默认值为 true。 + - `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 1(不缩放)。 - `customActions?: DeviceAction[]` - 可选参数,自定义设备动作列表。 ### iOS Agent 上的更多接口 diff --git a/packages/android/src/agent.ts b/packages/android/src/agent.ts index 4c2c3bf48..76d0390e6 100644 --- a/packages/android/src/agent.ts +++ b/packages/android/src/agent.ts @@ -44,6 +44,7 @@ export async function agentFromAdbDevice( 
usePhysicalDisplayIdForScreenshot: opts?.usePhysicalDisplayIdForScreenshot, usePhysicalDisplayIdForDisplayLookup: opts?.usePhysicalDisplayIdForDisplayLookup, + screenshotResizeScale: opts?.screenshotResizeScale, }); await device.connect(); diff --git a/packages/android/src/device.ts b/packages/android/src/device.ts index 30762e077..ce5d1fb16 100644 --- a/packages/android/src/device.ts +++ b/packages/android/src/device.ts @@ -60,6 +60,7 @@ export type AndroidDeviceOpt = { usePhysicalDisplayIdForScreenshot?: boolean; usePhysicalDisplayIdForDisplayLookup?: boolean; customActions?: DeviceAction[]; + screenshotResizeScale?: number; } & AndroidDeviceInputOpt; export class AndroidDevice implements AbstractInterface { @@ -67,6 +68,7 @@ export class AndroidDevice implements AbstractInterface { private yadbPushed = false; private devicePixelRatio = 1; private devicePixelRatioInitialized = false; + private scalingRatio = 1; // Record scaling ratio for coordinate adjustment private adb: ADB | null = null; private connectingAdb: Promise | null = null; private destroyed = false; @@ -713,25 +715,28 @@ ${Object.keys(size) const width = Number.parseInt(match[isLandscape ? 2 : 1], 10); const height = Number.parseInt(match[isLandscape ? 1 : 2], 10); - // Use cached device pixel ratio instead of calling getDisplayDensity() every time + // Determine scaling: use screenshotResizeScale if provided, otherwise use 1/devicePixelRatio + // Default is 1/dpr to scale down by device pixel ratio (e.g., dpr=3 -> scale=1/3) + const scale = + this.options?.screenshotResizeScale ?? 
1 / this.devicePixelRatio; + this.scalingRatio = scale; - // Convert physical pixels to logical pixels for consistent coordinate system + // Apply scale to get logical dimensions for AI processing // adjustCoordinates() will convert back to physical pixels when needed for touch operations - const logicalWidth = Math.round(width / this.devicePixelRatio); - const logicalHeight = Math.round(height / this.devicePixelRatio); + const logicalWidth = Math.round(width * scale); + const logicalHeight = Math.round(height * scale); return { width: logicalWidth, height: logicalHeight, - dpr: this.devicePixelRatio, }; } private adjustCoordinates(x: number, y: number): { x: number; y: number } { - const ratio = this.devicePixelRatio; + const scale = this.scalingRatio; return { - x: Math.round(x * ratio), - y: Math.round(y * ratio), + x: Math.round(x / scale), + y: Math.round(y / scale), }; } diff --git a/packages/android/tests/unit-test/page.test.ts b/packages/android/tests/unit-test/page.test.ts index 93551fa24..84f37ba14 100644 --- a/packages/android/tests/unit-test/page.test.ts +++ b/packages/android/tests/unit-test/page.test.ts @@ -125,7 +125,7 @@ describe('AndroidDevice', () => { const size1 = await device.size(); const size2 = await device.size(); - expect(size1).toEqual({ width: 540, height: 960, dpr: 2 }); + expect(size1).toEqual({ width: 540, height: 960 }); expect(size2).toEqual(size1); // Caching is removed, so it should be called twice expect(vi.spyOn(device as any, 'getScreenSize')).toHaveBeenCalledTimes(2); @@ -1343,7 +1343,7 @@ describe('AndroidDevice', () => { expect(mockAdbInstance.shell).toHaveBeenCalledWith('dumpsys display'); expect(size.width).toBe(411); // 1080 / (420/160) ≈ 411 expect(size.height).toBe(731); // 1920 / (420/160) ≈ 731 - expect(size.dpr).toBe(2.625); // 420 / 160 = 2.625 + // dpr is no longer returned in size() }); it('should use display ID for screenshots by default when displayId is set', async () => { diff --git 
a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index f874e8508..765e04968 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,6 +48,7 @@ import { globalConfigManager, globalModelConfigManager, } from '@midscene/shared/env'; +import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; // import type { AndroidDeviceInputOpt } from '../device'; @@ -134,6 +135,16 @@ export class Agent< */ private hasWarnedNonVLModel = false; + /** + * Screenshot scale factor derived from actual screenshot dimensions + */ + private screenshotScale?: number; + + /** + * Internal promise to deduplicate screenshot scale computation + */ + private screenshotScalePromise?: Promise; + // @deprecated use .interface instead get page() { return this.interface; @@ -155,6 +166,52 @@ export class Agent< } } + /** + * Lazily compute the ratio between the physical screenshot width and the logical page width + */ + private async getScreenshotScale(context: UIContext): Promise { + if (this.screenshotScale !== undefined) { + return this.screenshotScale; + } + + if (!this.screenshotScalePromise) { + this.screenshotScalePromise = (async () => { + const pageWidth = context.size?.width; + assert( + pageWidth && pageWidth > 0, + `Invalid page width when computing screenshot scale: ${pageWidth}`, + ); + + const { width: screenshotWidth } = await imageInfoOfBase64( + context.screenshotBase64, + ); + + assert( + Number.isFinite(screenshotWidth) && screenshotWidth > 0, + `Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`, + ); + + const computedScale = screenshotWidth / pageWidth; + assert( + Number.isFinite(computedScale) && computedScale > 0, + `Invalid computed screenshot scale: ${computedScale}`, + ); + + debug( + `Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and 
page width ${pageWidth}`, + ); + return computedScale; + })(); + } + + try { + this.screenshotScale = await this.screenshotScalePromise; + return this.screenshotScale; + } finally { + this.screenshotScalePromise = undefined; + } + } + constructor(interfaceInstance: InterfaceType, opts?: AgentOpt) { this.interface = interfaceInstance; this.opts = Object.assign( @@ -218,15 +275,37 @@ export class Agent< return this.frozenUIContext; } + // Get original context + let context: UIContext; if (this.interface.getContext) { debug('Using page.getContext for action:', action); - return await this.interface.getContext(); + context = await this.interface.getContext(); } else { debug('Using commonContextParser for action:', action); - return await commonContextParser(this.interface, { + context = await commonContextParser(this.interface, { uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(), }); } + + const computedScreenshotScale = await this.getScreenshotScale(context); + + if (computedScreenshotScale !== 1) { + const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4)); + debug( + `Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`, + ); + const targetWidth = Math.round(context.size.width); + const targetHeight = Math.round(context.size.height); + debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`); + context.screenshotBase64 = await resizeImgBase64( + context.screenshotBase64, + { width: targetWidth, height: targetHeight }, + ); + } else { + debug(`screenshot scale=${computedScreenshotScale}`); + } + + return context; } async _snapshotContext(): Promise { @@ -827,12 +906,18 @@ export class Agent< const { element } = output; + const dprValue = (await this.interface.size()).dpr; // size() is async: resolve it first, then read the deprecated, optional dpr field + const dprEntry = dprValue + ? 
{ + dpr: dprValue, + } + : {}; return { rect: element?.rect, center: element?.center, - scale: (await this.interface.size()).dpr, + ...dprEntry, } as Pick & { - scale: number; + dpr?: number; // this field is deprecated }; } diff --git a/packages/core/src/agent/utils.ts b/packages/core/src/agent/utils.ts index 821247d30..c2cc8a008 100644 --- a/packages/core/src/agent/utils.ts +++ b/packages/core/src/agent/utils.ts @@ -50,21 +50,12 @@ export async function commonContextParser( }); debugProfile('UploadTestInfoToServer end'); - let screenshotBase64 = await interfaceInstance.screenshotBase64(); + const screenshotBase64 = await interfaceInstance.screenshotBase64(); assert(screenshotBase64!, 'screenshotBase64 is required'); const size = await interfaceInstance.size(); debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`); - if (size.dpr && size.dpr !== 1) { - debugProfile('Resizing screenshot for non-1 dpr'); - screenshotBase64 = await resizeImgBase64(screenshotBase64, { - width: size.width, - height: size.height, - }); - debugProfile('ResizeImgBase64 end'); - } - return { tree: { node: null, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index fa920e8b6..9a7e82b13 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -604,4 +604,6 @@ export interface AgentOpt { modelConfig?: TModelConfigFn; cache?: Cache; replanningCycleLimit?: number; + /* Screenshot scaling ratio to reduce image size sent to AI for better performance */ + screenshotScale?: number; } diff --git a/packages/ios/src/agent.ts b/packages/ios/src/agent.ts index 2b2bf5a99..65fe8022c 100644 --- a/packages/ios/src/agent.ts +++ b/packages/ios/src/agent.ts @@ -31,6 +31,7 @@ export async function agentFromWebDriverAgent( wdaPort: opts?.wdaPort, wdaHost: opts?.wdaHost, useWDA: opts?.useWDA, + screenshotResizeScale: opts?.screenshotResizeScale, }); await device.connect(); diff --git a/packages/ios/src/device.ts b/packages/ios/src/device.ts index 
c13146053..a40153bea 100644 --- a/packages/ios/src/device.ts +++ b/packages/ios/src/device.ts @@ -37,12 +37,14 @@ export type IOSDeviceOpt = { wdaPort?: number; wdaHost?: string; useWDA?: boolean; + screenshotResizeScale?: number; } & IOSDeviceInputOpt; export class IOSDevice implements AbstractInterface { private deviceId: string; private devicePixelRatio = 1; private devicePixelRatioInitialized = false; + private scalingRatio = 1; // Record scaling ratio for coordinate adjustment private destroyed = false; private description: string | undefined; private customActions?: DeviceAction[]; @@ -341,12 +343,35 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) } async size(): Promise { + // Ensure device pixel ratio is initialized first + await this.initializeDevicePixelRatio(); + const screenSize = await this.getScreenSize(); + // Determine scaling: use screenshotResizeScale if provided, otherwise fall back to 1 + // Default is 1 (no scaling; devicePixelRatio is not applied to the reported size here) + const scale = this.options?.screenshotResizeScale ?? 
1; + this.scalingRatio = scale; + + // Apply scale to get logical dimensions for AI processing + const logicalWidth = Math.round(screenSize.width * scale); + const logicalHeight = Math.round(screenSize.height * scale); + + debugDevice( + `size() - screenSize: ${screenSize.width}x${screenSize.height}, scale: ${scale}, logicalSize: ${logicalWidth}x${logicalHeight}`, + ); + return { - width: screenSize.width, - height: screenSize.height, - dpr: screenSize.scale, + width: logicalWidth, + height: logicalHeight, + }; + } + + private adjustCoordinates(x: number, y: number): { x: number; y: number } { + const scale = this.scalingRatio; + return { + x: Math.round(x / scale), + y: Math.round(y / scale), }; } @@ -399,7 +424,11 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) // Core interaction methods async tap(x: number, y: number): Promise { - await this.wdaBackend.tap(x, y); + const adjusted = this.adjustCoordinates(x, y); + debugDevice( + `tap at coordinates - input: (${x}, ${y}), adjusted: (${adjusted.x}, ${adjusted.y}), scale: ${this.scalingRatio}`, + ); + await this.wdaBackend.tap(adjusted.x, adjusted.y); } // Android-compatible method name @@ -409,11 +438,13 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) } async doubleTap(x: number, y: number): Promise { - await this.wdaBackend.doubleTap(x, y); + const adjusted = this.adjustCoordinates(x, y); + await this.wdaBackend.doubleTap(adjusted.x, adjusted.y); } async longPress(x: number, y: number, duration = 1000): Promise { - await this.wdaBackend.longPress(x, y, duration); + const adjusted = this.adjustCoordinates(x, y); + await this.wdaBackend.longPress(adjusted.x, adjusted.y, duration); } async swipe( @@ -423,7 +454,15 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) toY: number, duration = 500, ): Promise { - await this.wdaBackend.swipe(fromX, fromY, toX, toY, duration); + const adjustedFrom = this.adjustCoordinates(fromX, fromY); + const adjustedTo = 
this.adjustCoordinates(toX, toY); + await this.wdaBackend.swipe( + adjustedFrom.x, + adjustedFrom.y, + adjustedTo.x, + adjustedTo.y, + duration, + ); } async typeText(text: string, options?: IOSDeviceInputOpt): Promise { diff --git a/packages/ios/tests/unit-test/device.test.ts b/packages/ios/tests/unit-test/device.test.ts index b827b11ee..4973097ce 100644 --- a/packages/ios/tests/unit-test/device.test.ts +++ b/packages/ios/tests/unit-test/device.test.ts @@ -186,7 +186,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); expect(mockWdaClient.getWindowSize).toHaveBeenCalled(); }); @@ -298,7 +297,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); }); @@ -472,11 +470,6 @@ describe('IOSDevice', () => { await device.connect(); }); - it('should calculate DPR correctly', async () => { - const size = await device.size(); - expect(size.dpr).toBe(2); // DPR from mocked getScreenScale - }); - it('should handle different screen sizes', async () => { mockWdaClient.getWindowSize = vi .fn() diff --git a/packages/shared/src/img/info.ts b/packages/shared/src/img/info.ts index 8575a3052..0045aca57 100644 --- a/packages/shared/src/img/info.ts +++ b/packages/shared/src/img/info.ts @@ -1,14 +1,9 @@ import assert from 'node:assert'; import { Buffer } from 'node:buffer'; import type Jimp from 'jimp'; +import type { Size } from '../types'; import getJimp from './get-jimp'; -export interface Size { - width: number; - height: number; - dpr?: number; -} - export interface ImageInfo extends Size { jimpImage: Jimp; } diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index 359bdd963..43655f390 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -7,9 +7,9 @@ export interface Point { } export interface Size { - width: number; // device independent window size - height: number; - dpr?: number; // the scale factor of the screenshots + 
width: number; // The image sent to AI model will be resized to this width, also the coordinates in the action space will be scaled to the range [0, width]. Usually you should set it to the logical pixel size + height: number; // The image sent to AI model will be resized to this height, also the coordinates in the action space will be scaled to the range [0, height]. Usually you should set it to the logical pixel size + dpr?: number; // this is deprecated, do NOT use it } export type Rect = Point & Size & { zoom?: number }; diff --git a/packages/web-integration/tests/unit-test/freeze-context.test.ts b/packages/web-integration/tests/unit-test/freeze-context.test.ts index d6d7e5d12..bb111082a 100644 --- a/packages/web-integration/tests/unit-test/freeze-context.test.ts +++ b/packages/web-integration/tests/unit-test/freeze-context.test.ts @@ -15,8 +15,8 @@ const mockPage = { evaluateJavaScript: vi.fn(), size: vi.fn().mockResolvedValue({ width: 1920, height: 1080, dpr: 1 }), url: vi.fn().mockResolvedValue('https://example.com'), - getContext: vi.fn().mockImplementation(async function () { - return await WebPageContextParser(this); + getContext: vi.fn().mockImplementation(async function (this: WebPage) { + return await WebPageContextParser(this, {}); }), } as unknown as WebPage; @@ -31,7 +31,8 @@ describe('PageAgent freeze/unfreeze page context', () => { // Create mock contexts mockContext = { size: { width: 1920, height: 1080, dpr: 1 }, - screenshotBase64: 'mock-screenshot-base64-1', + screenshotBase64: + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==', tree: [ { id: 'element1', @@ -46,7 +47,8 @@ describe('PageAgent freeze/unfreeze page context', () => { mockContext2 = { size: { width: 1920, height: 1080, dpr: 1 }, - screenshotBase64: 'mock-screenshot-base64-2', + screenshotBase64: + 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==', tree: [ { id: 'element2', @@ -138,7 +140,9 @@ describe('PageAgent freeze/unfreeze page context', () => { // Frozen context should be marked const frozenContext = (agent as any).frozenUIContext; expect(frozenContext._isFrozen).toBe(true); - expect(frozenContext.screenshotBase64).toBe(mockContext.screenshotBase64); + expect(frozenContext.screenshotBase64).toBe( + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==', + ); expect(frozenContext.tree).toBe(mockContext.tree); });