From 397fa2be3d30eef1c4a8d2564eaefae5853b968e Mon Sep 17 00:00:00 2001 From: quanruzhuoxiu Date: Tue, 30 Sep 2025 14:06:07 +0800 Subject: [PATCH 1/7] feat(core): add screenshot scaling support for improved AI processing --- packages/android/src/device.ts | 56 ++++++------------------------ packages/core/src/agent/agent.ts | 48 ++++++++++++++++++++++--- packages/core/src/agent/utils.ts | 11 +----- packages/core/src/types.ts | 2 ++ packages/shared/src/img/info.ts | 7 +--- packages/shared/src/types/index.ts | 4 +-- 6 files changed, 60 insertions(+), 68 deletions(-) diff --git a/packages/android/src/device.ts b/packages/android/src/device.ts index 30762e077..7f016d5d8 100644 --- a/packages/android/src/device.ts +++ b/packages/android/src/device.ts @@ -713,28 +713,15 @@ ${Object.keys(size) const width = Number.parseInt(match[isLandscape ? 2 : 1], 10); const height = Number.parseInt(match[isLandscape ? 1 : 2], 10); - // Use cached device pixel ratio instead of calling getDisplayDensity() every time - - // Convert physical pixels to logical pixels for consistent coordinate system - // adjustCoordinates() will convert back to physical pixels when needed for touch operations - const logicalWidth = Math.round(width / this.devicePixelRatio); - const logicalHeight = Math.round(height / this.devicePixelRatio); - + // Return physical pixels to match screenshot dimensions + // This ensures AI coordinate conversion uses the same dimensions as the screenshot return { - width: logicalWidth, - height: logicalHeight, + width, + height, dpr: this.devicePixelRatio, }; } - private adjustCoordinates(x: number, y: number): { x: number; y: number } { - const ratio = this.devicePixelRatio; - return { - x: Math.round(x * ratio), - y: Math.round(y * ratio), - }; - } - /** * Calculate the end point for scroll operations based on start point, scroll delta, and screen boundaries. * This method ensures that scroll operations stay within screen bounds and maintain a minimum scroll distance @@ -1183,20 +1170,17 @@ ${Object.keys(size) async mouseClick(x: number, y: number): Promise { const adb = await this.getAdb(); - // Use adjusted coordinates - const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); await adb.shell( - `input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} 150`, + `input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} 150`, ); } async mouseDoubleClick(x: number, y: number): Promise { const adb = await this.getAdb(); - const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); // Use input tap for double-click as it generates proper touch events // that Android can recognize as a double-click gesture - const tapCommand = `input${this.getDisplayArg()} tap ${adjustedX} ${adjustedY}`; + const tapCommand = `input${this.getDisplayArg()} tap ${x} ${y}`; await adb.shell(tapCommand); // Short delay between taps for double-click recognition await sleep(50); @@ -1216,15 +1200,11 @@ ${Object.keys(size) ): Promise { const adb = await this.getAdb(); - // Use adjusted coordinates - const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y); - const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y); - // Ensure duration has a default value const swipeDuration = duration ?? defaultNormalScrollDuration; await adb.shell( - `input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${swipeDuration}`, + `input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${swipeDuration}`, ); } @@ -1264,22 +1244,12 @@ ${Object.keys(size) const endX = startX - deltaX; const endY = startY - deltaY; - // Adjust coordinates to fit device ratio - const { x: adjustedStartX, y: adjustedStartY } = this.adjustCoordinates( - startX, - startY, - ); - const { x: adjustedEndX, y: adjustedEndY } = this.adjustCoordinates( - endX, - endY, - ); - const adb = await this.getAdb(); const swipeDuration = duration ?? defaultNormalScrollDuration; // Execute the swipe operation await adb.shell( - `input${this.getDisplayArg()} swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${swipeDuration}`, + `input${this.getDisplayArg()} swipe ${startX} ${startY} ${endX} ${endY} ${swipeDuration}`, ); } @@ -1320,10 +1290,8 @@ ${Object.keys(size) async longPress(x: number, y: number, duration = 1000): Promise { const adb = await this.getAdb(); - // Use adjusted coordinates - const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); await adb.shell( - `input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} ${duration}`, + `input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} ${duration}`, ); } @@ -1355,13 +1323,9 @@ ${Object.keys(size) ): Promise { const adb = await this.getAdb(); - // Use adjusted coordinates - const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y); - const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y); - // Use the specified duration for better pull gesture recognition await adb.shell( - `input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${duration}`, + `input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${duration}`, ); } diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index f874e8508..272ee6dc0 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,6 +48,7 @@ import { globalConfigManager, globalModelConfigManager, } from '@midscene/shared/env'; +import { resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; // import type { AndroidDeviceInputOpt } from '../device'; @@ -134,6 +135,11 @@ export class Agent< */ private hasWarnedNonVLModel = false; + /** + * Screenshot scale factor for AI model processing + */ + private screenshotScale?: number; + // @deprecated use .interface instead get page() { return this.interface; @@ -176,6 +182,7 @@ export class Agent< ? new ModelConfigManager(opts.modelConfig) : globalModelConfigManager; + this.screenshotScale = opts?.screenshotScale; this.onTaskStartTip = this.opts.onTaskStartTip; this.insight = new Insight(async (action: InsightAction) => { @@ -218,15 +225,48 @@ export class Agent< return this.frozenUIContext; } + // Get original context + let context: UIContext; if (this.interface.getContext) { debug('Using page.getContext for action:', action); - return await this.interface.getContext(); + context = await this.interface.getContext(); } else { debug('Using commonContextParser for action:', action); - return await commonContextParser(this.interface, { + context = await commonContextParser(this.interface, { uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(), }); } + + // Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR + let targetWidth = context.size.width; + let targetHeight = context.size.height; + let needResize = false; + + if (this.screenshotScale && this.screenshotScale !== 1) { + // User-specified scaling ratio + debug(`Applying user screenshot scale: ${this.screenshotScale}`); + targetWidth = Math.round(context.size.width * this.screenshotScale); + targetHeight = Math.round(context.size.height * this.screenshotScale); + needResize = true; + } else if (context.size.dpr && context.size.dpr !== 1) { + // No user-specified scaling, use DPR scaling to logical size + debug( + `Applying DPR scaling: ${context.size.dpr} (resize to logical size)`, + ); + // Target is logical size, no need to change targetWidth/targetHeight + needResize = true; + } + + // Execute scaling + if (needResize) { + debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`); + context.screenshotBase64 = await resizeImgBase64( + context.screenshotBase64, + { width: targetWidth, height: targetHeight }, + ); + } + + return context; } async _snapshotContext(): Promise { @@ -830,9 +870,9 @@ export class Agent< return { rect: element?.rect, center: element?.center, - scale: (await this.interface.size()).dpr, + dpr: (await this.interface.size()).dpr, } as Pick & { - scale: number; + dpr: number; }; } diff --git a/packages/core/src/agent/utils.ts b/packages/core/src/agent/utils.ts index 821247d30..c2cc8a008 100644 --- a/packages/core/src/agent/utils.ts +++ b/packages/core/src/agent/utils.ts @@ -50,21 +50,12 @@ export async function commonContextParser( }); debugProfile('UploadTestInfoToServer end'); - let screenshotBase64 = await interfaceInstance.screenshotBase64(); + const screenshotBase64 = await interfaceInstance.screenshotBase64(); assert(screenshotBase64!, 'screenshotBase64 is required'); const size = await interfaceInstance.size(); debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`); - if (size.dpr && size.dpr !== 1) { - debugProfile('Resizing screenshot for non-1 dpr'); - screenshotBase64 = await resizeImgBase64(screenshotBase64, { - width: size.width, - height: size.height, - }); - debugProfile('ResizeImgBase64 end'); - } - return { tree: { node: null, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index fa920e8b6..9a7e82b13 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -604,4 +604,6 @@ export interface AgentOpt { modelConfig?: TModelConfigFn; cache?: Cache; replanningCycleLimit?: number; + /* Screenshot scaling ratio to reduce image size sent to AI for better performance */ + screenshotScale?: number; } diff --git a/packages/shared/src/img/info.ts b/packages/shared/src/img/info.ts index 8575a3052..0045aca57 100644 --- a/packages/shared/src/img/info.ts +++ b/packages/shared/src/img/info.ts @@ -1,14 +1,9 @@ import assert from 'node:assert'; import { Buffer } from 'node:buffer'; import type Jimp from 'jimp'; +import type { Size } from '../types'; import getJimp from './get-jimp'; -export interface Size { - width: number; - height: number; - dpr?: number; -} - export interface ImageInfo extends Size { jimpImage: Jimp; } diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index 359bdd963..e16e9843d 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -7,9 +7,9 @@ export interface Point { } export interface Size { - width: number; // device independent window size + width: number; // logical pixel size height: number; - dpr?: number; // the scale factor of the screenshots + dpr?: number; // dpr is the ratio of the physical pixel to the logical pixel. For example, the dpr is 2 when the screenshotBase64 returned is 2000x1000 when the logical width and height are 1000x500 here. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model. } export type Rect = Point & Size & { zoom?: number }; From 275e0d18bcbd799c77c0221ce58b4f8c574b8841 Mon Sep 17 00:00:00 2001 From: yutao Date: Tue, 30 Sep 2025 16:19:08 +0800 Subject: [PATCH 2/7] feat(core): calculate dpr in agent --- packages/core/src/agent/agent.ts | 95 ++++++++++++++++++++++-------- packages/shared/src/types/index.ts | 6 +- 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index 272ee6dc0..765e04968 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -48,7 +48,7 @@ import { globalConfigManager, globalModelConfigManager, } from '@midscene/shared/env'; -import { resizeImgBase64 } from '@midscene/shared/img'; +import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; // import type { AndroidDeviceInputOpt } from '../device'; @@ -136,10 +136,15 @@ export class Agent< private hasWarnedNonVLModel = false; /** - * Screenshot scale factor for AI model processing + * Screenshot scale factor derived from actual screenshot dimensions */ private screenshotScale?: number; + /** + * Internal promise to deduplicate screenshot scale computation + */ + private screenshotScalePromise?: Promise; + // @deprecated use .interface instead get page() { return this.interface; @@ -161,6 +166,52 @@ export class Agent< } } + /** + * Lazily compute the ratio between the physical screenshot width and the logical page width + */ + private async getScreenshotScale(context: UIContext): Promise { + if (this.screenshotScale !== undefined) { + return this.screenshotScale; + } + + if (!this.screenshotScalePromise) { + this.screenshotScalePromise = (async () => { + const pageWidth = context.size?.width; + assert( + pageWidth && pageWidth > 0, + `Invalid page width when computing screenshot scale: ${pageWidth}`, + ); + + const { width: screenshotWidth } = await imageInfoOfBase64( + context.screenshotBase64, + ); + + assert( + Number.isFinite(screenshotWidth) && screenshotWidth > 0, + `Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`, + ); + + const computedScale = screenshotWidth / pageWidth; + assert( + Number.isFinite(computedScale) && computedScale > 0, + `Invalid computed screenshot scale: ${computedScale}`, + ); + + debug( + `Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and page width ${pageWidth}`, + ); + return computedScale; + })(); + } + + try { + this.screenshotScale = await this.screenshotScalePromise; + return this.screenshotScale; + } finally { + this.screenshotScalePromise = undefined; + } + } + constructor(interfaceInstance: InterfaceType, opts?: AgentOpt) { this.interface = interfaceInstance; this.opts = Object.assign( @@ -182,7 +233,6 @@ export class Agent< ? new ModelConfigManager(opts.modelConfig) : globalModelConfigManager; - this.screenshotScale = opts?.screenshotScale; this.onTaskStartTip = this.opts.onTaskStartTip; this.insight = new Insight(async (action: InsightAction) => { @@ -237,33 +287,22 @@ export class Agent< }); } - // Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR - let targetWidth = context.size.width; - let targetHeight = context.size.height; - let needResize = false; - - if (this.screenshotScale && this.screenshotScale !== 1) { - // User-specified scaling ratio - debug(`Applying user screenshot scale: ${this.screenshotScale}`); - targetWidth = Math.round(context.size.width * this.screenshotScale); - targetHeight = Math.round(context.size.height * this.screenshotScale); - needResize = true; - } else if (context.size.dpr && context.size.dpr !== 1) { - // No user-specified scaling, use DPR scaling to logical size + const computedScreenshotScale = await this.getScreenshotScale(context); + + if (computedScreenshotScale !== 1) { + const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4)); debug( - `Applying DPR scaling: ${context.size.dpr} (resize to logical size)`, + `Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`, ); - // Target is logical size, no need to change targetWidth/targetHeight - needResize = true; - } - - // Execute scaling - if (needResize) { + const targetWidth = Math.round(context.size.width); + const targetHeight = Math.round(context.size.height); debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`); context.screenshotBase64 = await resizeImgBase64( context.screenshotBase64, { width: targetWidth, height: targetHeight }, ); + } else { + debug(`screenshot scale=${computedScreenshotScale}`); } return context; @@ -867,12 +906,18 @@ export class Agent< const { element } = output; + const dprValue = await (this.interface.size() as any).dpr; + const dprEntry = dprValue + ? { + dpr: dprValue, + } + : {}; return { rect: element?.rect, center: element?.center, - dpr: (await this.interface.size()).dpr, + ...dprEntry, } as Pick & { - dpr: number; + dpr?: number; // this field is deprecated }; } diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index e16e9843d..36dd03c49 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -7,9 +7,9 @@ export interface Point { } export interface Size { - width: number; // logical pixel size - height: number; - dpr?: number; // dpr is the ratio of the physical pixel to the logical pixel. For example, the dpr is 2 when the screenshotBase64 returned is 2000x1000 when the logical width and height are 1000x500 here. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model. + width: number; // The image sent to AI model will be resized to this width. usually you should set it to the logical pixel size + height: number; // The image sent to AI model will be resized to this height. usually you should set it to the logical pixel size + dpr?: number; // this is deprecated, do NOT use it } export type Rect = Point & Size & { zoom?: number }; From 605600403e788d6030222baaee28ffab935ff55c Mon Sep 17 00:00:00 2001 From: yutao Date: Tue, 30 Sep 2025 16:25:11 +0800 Subject: [PATCH 3/7] feat(core): calculate dpr in agent --- packages/shared/src/types/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index 36dd03c49..43655f390 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -7,8 +7,8 @@ export interface Point { } export interface Size { - width: number; // The image sent to AI model will be resized to this width. usually you should set it to the logical pixel size - height: number; // The image sent to AI model will be resized to this height. usually you should set it to the logical pixel size + width: number; // The image sent to AI model will be resized to this width, also the coordinates in the action space will be scaled to the range [0, width]. Usually you should set it to the logical pixel size + height: number; // The image sent to AI model will be resized to this height, also the coordinates in the action space will be scaled to the range [0, height]. Usually you should set it to the logical pixel size dpr?: number; // this is deprecated, do NOT use it } From 858849dfd6d4117c6891e3a5479947243a02ed4b Mon Sep 17 00:00:00 2001 From: quanruzhuoxiu Date: Tue, 30 Sep 2025 17:43:52 +0800 Subject: [PATCH 4/7] feat(android): implement screenshot resizing and coordinate adjustment for AI processing --- packages/android/src/device.ts | 63 ++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/packages/android/src/device.ts b/packages/android/src/device.ts index 7f016d5d8..ce5d1fb16 100644 --- a/packages/android/src/device.ts +++ b/packages/android/src/device.ts @@ -60,6 +60,7 @@ export type AndroidDeviceOpt = { usePhysicalDisplayIdForScreenshot?: boolean; usePhysicalDisplayIdForDisplayLookup?: boolean; customActions?: DeviceAction[]; + screenshotResizeScale?: number; } & AndroidDeviceInputOpt; export class AndroidDevice implements AbstractInterface { @@ -67,6 +68,7 @@ export class AndroidDevice implements AbstractInterface { private yadbPushed = false; private devicePixelRatio = 1; private devicePixelRatioInitialized = false; + private scalingRatio = 1; // Record scaling ratio for coordinate adjustment private adb: ADB | null = null; private connectingAdb: Promise | null = null; private destroyed = false; @@ -713,12 +715,28 @@ ${Object.keys(size) const width = Number.parseInt(match[isLandscape ? 2 : 1], 10); const height = Number.parseInt(match[isLandscape ? 1 : 2], 10); - // Return physical pixels to match screenshot dimensions - // This ensures AI coordinate conversion uses the same dimensions as the screenshot + // Determine scaling: use screenshotResizeScale if provided, otherwise use 1/devicePixelRatio + // Default is 1/dpr to scale down by device pixel ratio (e.g., dpr=3 -> scale=1/3) + const scale = + this.options?.screenshotResizeScale ?? 1 / this.devicePixelRatio; + this.scalingRatio = scale; + + // Apply scale to get logical dimensions for AI processing + // adjustCoordinates() will convert back to physical pixels when needed for touch operations + const logicalWidth = Math.round(width * scale); + const logicalHeight = Math.round(height * scale); + + return { + width: logicalWidth, + height: logicalHeight, + }; + } + + private adjustCoordinates(x: number, y: number): { x: number; y: number } { + const scale = this.scalingRatio; return { - width, - height, - dpr: this.devicePixelRatio, + x: Math.round(x / scale), + y: Math.round(y / scale), }; } @@ -1170,17 +1188,20 @@ ${Object.keys(size) async mouseClick(x: number, y: number): Promise { const adb = await this.getAdb(); + // Use adjusted coordinates + const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); await adb.shell( - `input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} 150`, + `input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} 150`, ); } async mouseDoubleClick(x: number, y: number): Promise { const adb = await this.getAdb(); + const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); // Use input tap for double-click as it generates proper touch events // that Android can recognize as a double-click gesture - const tapCommand = `input${this.getDisplayArg()} tap ${x} ${y}`; + const tapCommand = `input${this.getDisplayArg()} tap ${adjustedX} ${adjustedY}`; await adb.shell(tapCommand); // Short delay between taps for double-click recognition await sleep(50); @@ -1200,11 +1221,15 @@ ${Object.keys(size) ): Promise { const adb = await this.getAdb(); + // Use adjusted coordinates + const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y); + const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y); + // Ensure duration has a default value const swipeDuration = duration ?? defaultNormalScrollDuration; await adb.shell( - `input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${swipeDuration}`, + `input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${swipeDuration}`, ); } @@ -1244,12 +1269,22 @@ ${Object.keys(size) const endX = startX - deltaX; const endY = startY - deltaY; + // Adjust coordinates to fit device ratio + const { x: adjustedStartX, y: adjustedStartY } = this.adjustCoordinates( + startX, + startY, + ); + const { x: adjustedEndX, y: adjustedEndY } = this.adjustCoordinates( + endX, + endY, + ); + const adb = await this.getAdb(); const swipeDuration = duration ?? defaultNormalScrollDuration; // Execute the swipe operation await adb.shell( - `input${this.getDisplayArg()} swipe ${startX} ${startY} ${endX} ${endY} ${swipeDuration}`, + `input${this.getDisplayArg()} swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${swipeDuration}`, ); } @@ -1290,8 +1325,10 @@ ${Object.keys(size) async longPress(x: number, y: number, duration = 1000): Promise { const adb = await this.getAdb(); + // Use adjusted coordinates + const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y); await adb.shell( - `input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} ${duration}`, + `input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} ${duration}`, ); } @@ -1323,9 +1360,13 @@ ${Object.keys(size) ): Promise { const adb = await this.getAdb(); + // Use adjusted coordinates + const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y); + const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y); + // Use the specified duration for better pull gesture recognition await adb.shell( - `input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${duration}`, + `input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${duration}`, ); } From 927132f8928cdb65885ce47d5800a44dd22d8dbe Mon Sep 17 00:00:00 2001 From: quanruzhuoxiu Date: Mon, 6 Oct 2025 12:46:01 +0800 Subject: [PATCH 5/7] feat(ios, android): add screenshot resize scale option for Android and iOS agents --- .husky/commit-msg | 7 ++- apps/site/docs/en/integrate-with-android.mdx | 1 + apps/site/docs/en/integrate-with-ios.mdx | 1 + apps/site/docs/zh/integrate-with-android.mdx | 1 + apps/site/docs/zh/integrate-with-ios.mdx | 1 + packages/android/src/agent.ts | 1 + packages/ios/src/agent.ts | 1 + packages/ios/src/device.ts | 53 +++++++++++++++++--- 8 files changed, 58 insertions(+), 8 deletions(-) mode change 100644 => 100755 .husky/commit-msg diff --git a/.husky/commit-msg b/.husky/commit-msg old mode 100644 new mode 100755 index e9982a6d9..b054e965d --- a/.husky/commit-msg +++ b/.husky/commit-msg @@ -1 +1,6 @@ -npx --no -- commitlint --edit "$1" \ No newline at end of file +#!/bin/sh + +# Ensure node is in PATH (for fnm users) +export PATH="$HOME/Library/Application Support/fnm/aliases/default/bin:$PATH" + +npx --no -- commitlint --edit "$1" diff --git a/apps/site/docs/en/integrate-with-android.mdx b/apps/site/docs/en/integrate-with-android.mdx index 80472a79c..589eb4a76 100644 --- a/apps/site/docs/en/integrate-with-android.mdx +++ b/apps/site/docs/en/integrate-with-android.mdx @@ -128,6 +128,7 @@ The AndroidDevice constructor supports the following parameters: - `remoteAdbPort?: number` - Optional, the remote adb port. - `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - Optional, when should Midscene invoke [yadb](https://github.com/ysbing/YADB) to input texts. (Default: 'always-yadb') - `displayId?: number` - Optional, the display id to use. (Default: undefined, means use the current display) + - `screenshotResizeScale?: number` - Optional, screenshot resize scale. For example, 0.5 means resize the screenshot to 50% of its original size, which can reduce AI processing image size and improve response speed. Default is `1 / devicePixelRatio` (automatically scaled based on device pixel ratio). ### Additional Android Agent Interfaces diff --git a/apps/site/docs/en/integrate-with-ios.mdx b/apps/site/docs/en/integrate-with-ios.mdx index a704e2c97..0cd5d5e38 100644 --- a/apps/site/docs/en/integrate-with-ios.mdx +++ b/apps/site/docs/en/integrate-with-ios.mdx @@ -151,6 +151,7 @@ The IOSDevice constructor supports the following parameters: - `wdaPort?: number` - Optional, WebDriverAgent port. Default is 8100. - `wdaHost?: string` - Optional, WebDriverAgent host. Default is 'localhost'. - `autoDismissKeyboard?: boolean` - Optional, whether to automatically dismiss keyboard after text input. Default is true. + - `screenshotResizeScale?: number` - Optional, screenshot resize scale. For example, 0.5 means resize the screenshot to 50% of its original size, which can reduce AI processing image size and improve response speed. Default is 1 (no scaling). - `customActions?: DeviceAction[]` - Optional, list of custom device actions. ### Additional iOS Agent Interfaces diff --git a/apps/site/docs/zh/integrate-with-android.mdx b/apps/site/docs/zh/integrate-with-android.mdx index 2800c75ca..4b94bd697 100644 --- a/apps/site/docs/zh/integrate-with-android.mdx +++ b/apps/site/docs/zh/integrate-with-android.mdx @@ -127,6 +127,7 @@ AndroidDevice 的构造函数支持以下参数: - `remoteAdbPort?: number` - 可选参数,用于指定远程 adb 端口。 - `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - 可选参数,控制 Midscene 何时调用 [yadb](https://github.com/ysbing/YADB) 来输入文本。默认值为 'always-yadb'。 - `displayId?: number` - 可选参数,用于指定要使用的显示器 ID。默认值为 undefined,表示使用当前显示器。 + - `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 `1 / devicePixelRatio`(根据设备像素比自动缩放)。 ### Android Agent 上的更多接口 diff --git a/apps/site/docs/zh/integrate-with-ios.mdx b/apps/site/docs/zh/integrate-with-ios.mdx index 25cdae85f..9c3f04ad7 100644 --- a/apps/site/docs/zh/integrate-with-ios.mdx +++ b/apps/site/docs/zh/integrate-with-ios.mdx @@ -199,6 +199,7 @@ IOSDevice 的构造函数支持以下参数: - `wdaPort?: number` - 可选参数,WebDriverAgent 端口。默认值为 8100。 - `wdaHost?: string` - 可选参数,WebDriverAgent 主机。默认值为 'localhost'。 - `autoDismissKeyboard?: boolean` - 可选参数,是否在输入文本后自动关闭键盘。默认值为 true。 + - `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 1(不缩放)。 - `customActions?: DeviceAction[]` - 可选参数,自定义设备动作列表。 ### iOS Agent 上的更多接口 diff --git a/packages/android/src/agent.ts b/packages/android/src/agent.ts index 4c2c3bf48..76d0390e6 100644 --- a/packages/android/src/agent.ts +++ b/packages/android/src/agent.ts @@ -44,6 +44,7 @@ export async function agentFromAdbDevice( usePhysicalDisplayIdForScreenshot: opts?.usePhysicalDisplayIdForScreenshot, usePhysicalDisplayIdForDisplayLookup: opts?.usePhysicalDisplayIdForDisplayLookup, + screenshotResizeScale: opts?.screenshotResizeScale, }); await device.connect(); diff --git a/packages/ios/src/agent.ts b/packages/ios/src/agent.ts index 2b2bf5a99..65fe8022c 100644 --- a/packages/ios/src/agent.ts +++ b/packages/ios/src/agent.ts @@ -31,6 +31,7 @@ export async function agentFromWebDriverAgent( wdaPort: opts?.wdaPort, wdaHost: opts?.wdaHost, useWDA: opts?.useWDA, + screenshotResizeScale: opts?.screenshotResizeScale, }); await device.connect(); diff --git a/packages/ios/src/device.ts b/packages/ios/src/device.ts index c13146053..a40153bea 100644 --- a/packages/ios/src/device.ts +++ b/packages/ios/src/device.ts @@ -37,12 +37,14 @@ export type IOSDeviceOpt = { wdaPort?: number; wdaHost?: string; useWDA?: boolean; + screenshotResizeScale?: number; } & IOSDeviceInputOpt; export class IOSDevice implements AbstractInterface { private deviceId: string; private devicePixelRatio = 1; private devicePixelRatioInitialized = false; + private scalingRatio = 1; // Record scaling ratio for coordinate adjustment private destroyed = false; private description: string | undefined; private customActions?: DeviceAction[]; @@ -341,12 +343,35 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) } async size(): Promise { + // Ensure device pixel ratio is initialized first + await this.initializeDevicePixelRatio(); + const screenSize = await this.getScreenSize(); + // Determine scaling: use screenshotResizeScale if provided, otherwise use 1/devicePixelRatio + // Default is 1 + const scale = this.options?.screenshotResizeScale ?? 1; + this.scalingRatio = scale; + + // Apply scale to get logical dimensions for AI processing + const logicalWidth = Math.round(screenSize.width * scale); + const logicalHeight = Math.round(screenSize.height * scale); + + debugDevice( + `size() - screenSize: ${screenSize.width}x${screenSize.height}, scale: ${scale}, logicalSize: ${logicalWidth}x${logicalHeight}`, + ); + return { - width: screenSize.width, - height: screenSize.height, - dpr: screenSize.scale, + width: logicalWidth, + height: logicalHeight, + }; + } + + private adjustCoordinates(x: number, y: number): { x: number; y: number } { + const scale = this.scalingRatio; + return { + x: Math.round(x / scale), + y: Math.round(y / scale), }; } @@ -399,7 +424,11 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) // Core interaction methods async tap(x: number, y: number): Promise { - await this.wdaBackend.tap(x, y); + const adjusted = this.adjustCoordinates(x, y); + debugDevice( + `tap at coordinates - input: (${x}, ${y}), adjusted: (${adjusted.x}, ${adjusted.y}), scale: ${this.scalingRatio}`, + ); + await this.wdaBackend.tap(adjusted.x, adjusted.y); } // Android-compatible method name @@ -409,11 +438,13 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) } async doubleTap(x: number, y: number): Promise { - await this.wdaBackend.doubleTap(x, y); + const adjusted = this.adjustCoordinates(x, y); + await this.wdaBackend.doubleTap(adjusted.x, adjusted.y); } async longPress(x: number, y: number, duration = 1000): Promise { - await this.wdaBackend.longPress(x, y, duration); + const adjusted = this.adjustCoordinates(x, y); + await this.wdaBackend.longPress(adjusted.x, adjusted.y, duration); } async swipe( @@ -423,7 +454,15 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) toY: number, duration = 500, ): Promise { - await this.wdaBackend.swipe(fromX, fromY, toX, toY, duration); + const adjustedFrom = this.adjustCoordinates(fromX, fromY); + const adjustedTo = this.adjustCoordinates(toX, toY); + await this.wdaBackend.swipe( + adjustedFrom.x, + adjustedFrom.y, + adjustedTo.x, + adjustedTo.y, + duration, + ); } async typeText(text: string, options?: IOSDeviceInputOpt): Promise { From 6ec0eb1841fbcfbb3b5f15ef19dc5ade803023fc Mon Sep 17 00:00:00 2001 From: quanruzhuoxiu Date: Mon, 6 Oct 2025 13:26:16 +0800 Subject: [PATCH 6/7] refactor(tests): remove dpr from size assertions in Android and iOS device tests --- packages/android/tests/unit-test/page.test.ts | 4 ++-- packages/ios/tests/unit-test/device.test.ts | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/packages/android/tests/unit-test/page.test.ts b/packages/android/tests/unit-test/page.test.ts index 93551fa24..84f37ba14 100644 --- a/packages/android/tests/unit-test/page.test.ts +++ b/packages/android/tests/unit-test/page.test.ts @@ -125,7 +125,7 @@ describe('AndroidDevice', () => { const size1 = await device.size(); const size2 = await device.size(); - expect(size1).toEqual({ width: 540, height: 960, dpr: 2 }); + expect(size1).toEqual({ width: 540, height: 960 }); expect(size2).toEqual(size1); // Caching is removed, so it should be called twice expect(vi.spyOn(device as any, 'getScreenSize')).toHaveBeenCalledTimes(2); @@ -1343,7 +1343,7 @@ describe('AndroidDevice', () => { expect(mockAdbInstance.shell).toHaveBeenCalledWith('dumpsys display'); expect(size.width).toBe(411); // 1080 / (420/160) ≈ 411 expect(size.height).toBe(731); // 1920 / (420/160) ≈ 731 - expect(size.dpr).toBe(2.625); // 420 / 160 = 2.625 + // dpr is no longer returned in size() }); it('should use display ID for screenshots by default when displayId is set', async () => { diff --git a/packages/ios/tests/unit-test/device.test.ts b/packages/ios/tests/unit-test/device.test.ts index b827b11ee..4973097ce 100644 --- a/packages/ios/tests/unit-test/device.test.ts +++ b/packages/ios/tests/unit-test/device.test.ts @@ -186,7 +186,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); expect(mockWdaClient.getWindowSize).toHaveBeenCalled(); }); @@ -298,7 +297,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); }); @@ -472,11 +470,6 @@ describe('IOSDevice', () => { await device.connect(); }); - it('should calculate DPR correctly', async () => { - const size = await device.size(); - expect(size.dpr).toBe(2); // DPR from mocked getScreenScale - }); - it('should handle different screen sizes', async () => { mockWdaClient.getWindowSize = vi .fn() From 4cd4a179275f04dededbe15c27abc2da19d57dff Mon Sep 17 00:00:00 2001 From: quanruzhuoxiu Date: Mon, 6 Oct 2025 13:50:04 +0800 Subject: [PATCH 7/7] fix(tests): update screenshotBase64 values in freeze context tests for accuracy --- .../tests/unit-test/freeze-context.test.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/packages/web-integration/tests/unit-test/freeze-context.test.ts b/packages/web-integration/tests/unit-test/freeze-context.test.ts index d6d7e5d12..bb111082a 100644 --- a/packages/web-integration/tests/unit-test/freeze-context.test.ts +++ b/packages/web-integration/tests/unit-test/freeze-context.test.ts @@ -15,8 +15,8 @@ const mockPage = { evaluateJavaScript: vi.fn(), size: vi.fn().mockResolvedValue({ width: 1920, height: 1080, dpr: 1 }), url: vi.fn().mockResolvedValue('https://example.com'), - getContext: vi.fn().mockImplementation(async function () { - return await WebPageContextParser(this); + getContext: vi.fn().mockImplementation(async function (this: WebPage) { + return await WebPageContextParser(this, {}); }), } as unknown as WebPage; @@ -31,7 +31,8 @@ describe('PageAgent freeze/unfreeze page context', () => { // Create mock contexts mockContext = { size: { width: 1920, height: 1080, dpr: 1 }, - screenshotBase64: 'mock-screenshot-base64-1', + screenshotBase64: + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==', tree: [ { id: 'element1', @@ -46,7 +47,8 @@ describe('PageAgent freeze/unfreeze page context', () => { mockContext2 = { size: { width: 1920, height: 1080, dpr: 1 }, - screenshotBase64: 'mock-screenshot-base64-2', + screenshotBase64: + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==', tree: [ { id: 'element2', @@ -138,7 +140,9 @@ describe('PageAgent freeze/unfreeze page context', () => { // Frozen context should be marked const frozenContext = (agent as any).frozenUIContext; expect(frozenContext._isFrozen).toBe(true); - expect(frozenContext.screenshotBase64).toBe(mockContext.screenshotBase64); + expect(frozenContext.screenshotBase64).toBe( + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==', + ); expect(frozenContext.tree).toBe(mockContext.tree); });