Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .husky/commit-msg
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
npx --no -- commitlint --edit "$1"
#!/bin/sh

# Make the default fnm-managed node available to this hook.
# The directory below is fnm's default alias location on macOS; prepend it only
# when it actually exists so other platforms and non-fnm setups keep PATH untouched.
FNM_DEFAULT_BIN="$HOME/Library/Application Support/fnm/aliases/default/bin"
if [ -d "$FNM_DEFAULT_BIN" ]; then
  export PATH="$FNM_DEFAULT_BIN:$PATH"
fi

# Lint the commit message file that git passes as the first argument.
npx --no -- commitlint --edit "$1"
1 change: 1 addition & 0 deletions apps/site/docs/en/integrate-with-android.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ The AndroidDevice constructor supports the following parameters:
- `remoteAdbPort?: number` - Optional, the remote adb port.
- `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - Optional, when should Midscene invoke [yadb](https://github.com/ysbing/YADB) to input texts. (Default: 'always-yadb')
- `displayId?: number` - Optional, the display id to use. (Default: undefined, means use the current display)
- `screenshotResizeScale?: number` - Optional, the scale factor applied to screenshots. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI model and can improve response speed. (Default: `1 / devicePixelRatio`, i.e. scaled down automatically based on the device pixel ratio)

### Additional Android Agent Interfaces

Expand Down
1 change: 1 addition & 0 deletions apps/site/docs/en/integrate-with-ios.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ The IOSDevice constructor supports the following parameters:
- `wdaPort?: number` - Optional, WebDriverAgent port. Default is 8100.
- `wdaHost?: string` - Optional, WebDriverAgent host. Default is 'localhost'.
- `autoDismissKeyboard?: boolean` - Optional, whether to automatically dismiss keyboard after text input. Default is true.
- `screenshotResizeScale?: number` - Optional, the scale factor applied to screenshots. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI model and can improve response speed. Default is 1 (no scaling).
- `customActions?: DeviceAction<any>[]` - Optional, list of custom device actions.

### Additional iOS Agent Interfaces
Expand Down
1 change: 1 addition & 0 deletions apps/site/docs/zh/integrate-with-android.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ AndroidDevice 的构造函数支持以下参数:
- `remoteAdbPort?: number` - 可选参数,用于指定远程 adb 端口。
- `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - 可选参数,控制 Midscene 何时调用 [yadb](https://github.com/ysbing/YADB) 来输入文本。默认值为 'always-yadb'。
- `displayId?: number` - 可选参数,用于指定要使用的显示器 ID。默认值为 undefined,表示使用当前显示器。
- `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 `1 / devicePixelRatio`(根据设备像素比自动缩放)。

### Android Agent 上的更多接口

Expand Down
1 change: 1 addition & 0 deletions apps/site/docs/zh/integrate-with-ios.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ IOSDevice 的构造函数支持以下参数:
- `wdaPort?: number` - 可选参数,WebDriverAgent 端口。默认值为 8100。
- `wdaHost?: string` - 可选参数,WebDriverAgent 主机。默认值为 'localhost'。
- `autoDismissKeyboard?: boolean` - 可选参数,是否在输入文本后自动关闭键盘。默认值为 true。
- `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 1(不缩放)。
- `customActions?: DeviceAction<any>[]` - 可选参数,自定义设备动作列表。

### iOS Agent 上的更多接口
Expand Down
1 change: 1 addition & 0 deletions packages/android/src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ export async function agentFromAdbDevice(
usePhysicalDisplayIdForScreenshot: opts?.usePhysicalDisplayIdForScreenshot,
usePhysicalDisplayIdForDisplayLookup:
opts?.usePhysicalDisplayIdForDisplayLookup,
screenshotResizeScale: opts?.screenshotResizeScale,
});

await device.connect();
Expand Down
21 changes: 13 additions & 8 deletions packages/android/src/device.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@ export type AndroidDeviceOpt = {
usePhysicalDisplayIdForScreenshot?: boolean;
usePhysicalDisplayIdForDisplayLookup?: boolean;
customActions?: DeviceAction<any>[];
screenshotResizeScale?: number;
} & AndroidDeviceInputOpt;

export class AndroidDevice implements AbstractInterface {
private deviceId: string;
private yadbPushed = false;
private devicePixelRatio = 1;
private devicePixelRatioInitialized = false;
private scalingRatio = 1; // Record scaling ratio for coordinate adjustment
private adb: ADB | null = null;
private connectingAdb: Promise<ADB> | null = null;
private destroyed = false;
Expand Down Expand Up @@ -713,25 +715,28 @@ ${Object.keys(size)
const width = Number.parseInt(match[isLandscape ? 2 : 1], 10);
const height = Number.parseInt(match[isLandscape ? 1 : 2], 10);

// Use cached device pixel ratio instead of calling getDisplayDensity() every time
// Determine scaling: use screenshotResizeScale if provided, otherwise use 1/devicePixelRatio
// Default is 1/dpr to scale down by device pixel ratio (e.g., dpr=3 -> scale=1/3)
const scale =
this.options?.screenshotResizeScale ?? 1 / this.devicePixelRatio;
this.scalingRatio = scale;

// Convert physical pixels to logical pixels for consistent coordinate system
// Apply scale to get logical dimensions for AI processing
// adjustCoordinates() will convert back to physical pixels when needed for touch operations
const logicalWidth = Math.round(width / this.devicePixelRatio);
const logicalHeight = Math.round(height / this.devicePixelRatio);
const logicalWidth = Math.round(width * scale);
const logicalHeight = Math.round(height * scale);

return {
width: logicalWidth,
height: logicalHeight,
dpr: this.devicePixelRatio,
};
}

private adjustCoordinates(x: number, y: number): { x: number; y: number } {
const ratio = this.devicePixelRatio;
const scale = this.scalingRatio;
return {
x: Math.round(x * ratio),
y: Math.round(y * ratio),
x: Math.round(x / scale),
y: Math.round(y / scale),
};
}

Expand Down
4 changes: 2 additions & 2 deletions packages/android/tests/unit-test/page.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ describe('AndroidDevice', () => {
const size1 = await device.size();
const size2 = await device.size();

expect(size1).toEqual({ width: 540, height: 960, dpr: 2 });
expect(size1).toEqual({ width: 540, height: 960 });
expect(size2).toEqual(size1);
// Caching is removed, so it should be called twice
expect(vi.spyOn(device as any, 'getScreenSize')).toHaveBeenCalledTimes(2);
Expand Down Expand Up @@ -1343,7 +1343,7 @@ describe('AndroidDevice', () => {
expect(mockAdbInstance.shell).toHaveBeenCalledWith('dumpsys display');
expect(size.width).toBe(411); // 1080 / (420/160) ≈ 411
expect(size.height).toBe(731); // 1920 / (420/160) ≈ 731
expect(size.dpr).toBe(2.625); // 420 / 160 = 2.625
// dpr is no longer returned in size()
});

it('should use display ID for screenshots by default when displayId is set', async () => {
Expand Down
93 changes: 89 additions & 4 deletions packages/core/src/agent/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ import {
globalConfigManager,
globalModelConfigManager,
} from '@midscene/shared/env';
import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
// import type { AndroidDeviceInputOpt } from '../device';
Expand Down Expand Up @@ -134,6 +135,16 @@ export class Agent<
*/
private hasWarnedNonVLModel = false;

/**
* Screenshot scale factor derived from actual screenshot dimensions
*/
private screenshotScale?: number;

/**
* Internal promise to deduplicate screenshot scale computation
*/
private screenshotScalePromise?: Promise<number>;

// @deprecated use .interface instead
get page() {
return this.interface;
Expand All @@ -155,6 +166,52 @@ export class Agent<
}
}

/**
 * Resolve the ratio between the screenshot's pixel width and the page width
 * reported in `context.size`.
 *
 * The ratio is computed at most once: the first successful result is stored in
 * `this.screenshotScale` and returned directly afterwards. Callers that arrive
 * while a computation is still pending await the same promise. The pending
 * promise is always cleared once it settles, so a failed computation can be
 * retried by a later call.
 *
 * @param context - UI context providing `size.width` and `screenshotBase64`
 * @returns screenshot width divided by page width
 */
private async getScreenshotScale(context: UIContext): Promise<number> {
  if (this.screenshotScale !== undefined) {
    return this.screenshotScale;
  }

  // Measures the screenshot and derives the ratio; every step is asserted so
  // that an unusable value never gets cached.
  const measureScale = async (): Promise<number> => {
    const pageWidth = context.size?.width;
    assert(
      pageWidth && pageWidth > 0,
      `Invalid page width when computing screenshot scale: ${pageWidth}`,
    );

    const screenshotInfo = await imageInfoOfBase64(context.screenshotBase64);
    const screenshotWidth = screenshotInfo.width;
    assert(
      Number.isFinite(screenshotWidth) && screenshotWidth > 0,
      `Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`,
    );

    const computedScale = screenshotWidth / pageWidth;
    assert(
      Number.isFinite(computedScale) && computedScale > 0,
      `Invalid computed screenshot scale: ${computedScale}`,
    );

    debug(
      `Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and page width ${pageWidth}`,
    );
    return computedScale;
  };

  // Start a computation only when none is in flight
  if (!this.screenshotScalePromise) {
    this.screenshotScalePromise = measureScale();
  }

  try {
    const resolvedScale = await this.screenshotScalePromise;
    this.screenshotScale = resolvedScale;
    return resolvedScale;
  } finally {
    this.screenshotScalePromise = undefined;
  }
}

constructor(interfaceInstance: InterfaceType, opts?: AgentOpt) {
this.interface = interfaceInstance;
this.opts = Object.assign(
Expand Down Expand Up @@ -218,15 +275,37 @@ export class Agent<
return this.frozenUIContext;
}

// Get original context
let context: UIContext;
if (this.interface.getContext) {
debug('Using page.getContext for action:', action);
return await this.interface.getContext();
context = await this.interface.getContext();
} else {
debug('Using commonContextParser for action:', action);
return await commonContextParser(this.interface, {
context = await commonContextParser(this.interface, {
uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(),
});
}

const computedScreenshotScale = await this.getScreenshotScale(context);

if (computedScreenshotScale !== 1) {
const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4));
debug(
`Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`,
);
const targetWidth = Math.round(context.size.width);
const targetHeight = Math.round(context.size.height);
debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`);
context.screenshotBase64 = await resizeImgBase64(
context.screenshotBase64,
{ width: targetWidth, height: targetHeight },
);
} else {
debug(`screenshot scale=${computedScreenshotScale}`);
}

return context;
}

async _snapshotContext(): Promise<UIContext> {
Expand Down Expand Up @@ -827,12 +906,18 @@ export class Agent<

const { element } = output;

// `size()` returns a Promise, so it has to be awaited BEFORE `.dpr` is read.
// Reading `.dpr` off the pending Promise itself always yields undefined, which
// would silently drop the deprecated `dpr` field for every interface.
const dprValue = ((await this.interface.size()) as any).dpr;
// Only include `dpr` in the result when the interface still reports one
const dprEntry = dprValue
  ? {
      dpr: dprValue,
    }
  : {};
return {
rect: element?.rect,
center: element?.center,
scale: (await this.interface.size()).dpr,
...dprEntry,
} as Pick<LocateResultElement, 'rect' | 'center'> & {
scale: number;
dpr?: number; // this field is deprecated
};
}

Expand Down
11 changes: 1 addition & 10 deletions packages/core/src/agent/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,12 @@ export async function commonContextParser(
});
debugProfile('UploadTestInfoToServer end');

let screenshotBase64 = await interfaceInstance.screenshotBase64();
const screenshotBase64 = await interfaceInstance.screenshotBase64();
assert(screenshotBase64!, 'screenshotBase64 is required');

const size = await interfaceInstance.size();
debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);

if (size.dpr && size.dpr !== 1) {
debugProfile('Resizing screenshot for non-1 dpr');
screenshotBase64 = await resizeImgBase64(screenshotBase64, {
width: size.width,
height: size.height,
});
debugProfile('ResizeImgBase64 end');
}

return {
tree: {
node: null,
Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -604,4 +604,6 @@ export interface AgentOpt {
modelConfig?: TModelConfigFn;
cache?: Cache;
replanningCycleLimit?: number;
/* Screenshot scaling ratio to reduce image size sent to AI for better performance */
screenshotScale?: number;
}
1 change: 1 addition & 0 deletions packages/ios/src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ export async function agentFromWebDriverAgent(
wdaPort: opts?.wdaPort,
wdaHost: opts?.wdaHost,
useWDA: opts?.useWDA,
screenshotResizeScale: opts?.screenshotResizeScale,
});

await device.connect();
Expand Down
53 changes: 46 additions & 7 deletions packages/ios/src/device.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ export type IOSDeviceOpt = {
wdaPort?: number;
wdaHost?: string;
useWDA?: boolean;
screenshotResizeScale?: number;
} & IOSDeviceInputOpt;

export class IOSDevice implements AbstractInterface {
private deviceId: string;
private devicePixelRatio = 1;
private devicePixelRatioInitialized = false;
private scalingRatio = 1; // Record scaling ratio for coordinate adjustment
private destroyed = false;
private description: string | undefined;
private customActions?: DeviceAction<any>[];
Expand Down Expand Up @@ -341,12 +343,35 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
}

async size(): Promise<Size> {
// Ensure device pixel ratio is initialized first
await this.initializeDevicePixelRatio();

const screenSize = await this.getScreenSize();

// Determine scaling: use screenshotResizeScale if provided
// Default is 1 (no scaling)
const scale = this.options?.screenshotResizeScale ?? 1;
this.scalingRatio = scale;

// Apply scale to get logical dimensions for AI processing
const logicalWidth = Math.round(screenSize.width * scale);
const logicalHeight = Math.round(screenSize.height * scale);

debugDevice(
`size() - screenSize: ${screenSize.width}x${screenSize.height}, scale: ${scale}, logicalSize: ${logicalWidth}x${logicalHeight}`,
);

return {
width: screenSize.width,
height: screenSize.height,
dpr: screenSize.scale,
width: logicalWidth,
height: logicalHeight,
};
}

/**
 * Convert a point given in the scaled size reported by `size()` back to
 * unscaled coordinates by dividing each axis by the recorded scaling ratio
 * and rounding to the nearest integer.
 *
 * `scalingRatio` starts at 1 and is refreshed whenever `size()` runs.
 */
private adjustCoordinates(x: number, y: number): { x: number; y: number } {
  const toUnscaled = (value: number): number =>
    Math.round(value / this.scalingRatio);
  return { x: toUnscaled(x), y: toUnscaled(y) };
}

Expand Down Expand Up @@ -399,7 +424,11 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})

// Core interaction methods
async tap(x: number, y: number): Promise<void> {
await this.wdaBackend.tap(x, y);
const adjusted = this.adjustCoordinates(x, y);
debugDevice(
`tap at coordinates - input: (${x}, ${y}), adjusted: (${adjusted.x}, ${adjusted.y}), scale: ${this.scalingRatio}`,
);
await this.wdaBackend.tap(adjusted.x, adjusted.y);
}

// Android-compatible method name
Expand All @@ -409,11 +438,13 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
}

async doubleTap(x: number, y: number): Promise<void> {
await this.wdaBackend.doubleTap(x, y);
const adjusted = this.adjustCoordinates(x, y);
await this.wdaBackend.doubleTap(adjusted.x, adjusted.y);
}

async longPress(x: number, y: number, duration = 1000): Promise<void> {
await this.wdaBackend.longPress(x, y, duration);
const adjusted = this.adjustCoordinates(x, y);
await this.wdaBackend.longPress(adjusted.x, adjusted.y, duration);
}

async swipe(
Expand All @@ -423,7 +454,15 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
toY: number,
duration = 500,
): Promise<void> {
await this.wdaBackend.swipe(fromX, fromY, toX, toY, duration);
const adjustedFrom = this.adjustCoordinates(fromX, fromY);
const adjustedTo = this.adjustCoordinates(toX, toY);
await this.wdaBackend.swipe(
adjustedFrom.x,
adjustedFrom.y,
adjustedTo.x,
adjustedTo.y,
duration,
);
}

async typeText(text: string, options?: IOSDeviceInputOpt): Promise<void> {
Expand Down
Loading