Skip to content

Commit 927132f

Browse files
committed
feat(ios, android): add screenshot resize scale option for Android and iOS agents
1 parent 858849d commit 927132f

File tree

8 files changed

+58
-8
lines changed

8 files changed

+58
-8
lines changed

.husky/commit-msg

100644 → 100755
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,6 @@
1-
npx --no -- commitlint --edit "$1"
1+
#!/bin/sh
2+
3+
# Ensure node is in PATH (for fnm users)
4+
export PATH="$HOME/Library/Application Support/fnm/aliases/default/bin:$PATH"
5+
6+
npx --no -- commitlint --edit "$1"

apps/site/docs/en/integrate-with-android.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ The AndroidDevice constructor supports the following parameters:
128128
- `remoteAdbPort?: number` - Optional, the remote adb port.
129129
- `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - Optional, controls when Midscene invokes [yadb](https://github.com/ysbing/YADB) to input text. (Default: 'always-yadb')
130130
- `displayId?: number` - Optional, the display id to use. (Default: undefined, means use the current display)
131+
- `screenshotResizeScale?: number` - Optional, the scale factor applied to screenshots. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI and improves response speed. Default is `1 / devicePixelRatio` (automatically scaled based on the device pixel ratio).
131132

132133
### Additional Android Agent Interfaces
133134

apps/site/docs/en/integrate-with-ios.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ The IOSDevice constructor supports the following parameters:
151151
- `wdaPort?: number` - Optional, WebDriverAgent port. Default is 8100.
152152
- `wdaHost?: string` - Optional, WebDriverAgent host. Default is 'localhost'.
153153
- `autoDismissKeyboard?: boolean` - Optional, whether to automatically dismiss keyboard after text input. Default is true.
154+
- `screenshotResizeScale?: number` - Optional, the scale factor applied to screenshots. For example, 0.5 resizes the screenshot to 50% of its original size, which reduces the size of the image sent to the AI and improves response speed. Default is 1 (no scaling).
154155
- `customActions?: DeviceAction<any>[]` - Optional, list of custom device actions.
155156

156157
### Additional iOS Agent Interfaces

apps/site/docs/zh/integrate-with-android.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ AndroidDevice 的构造函数支持以下参数:
127127
- `remoteAdbPort?: number` - 可选参数,用于指定远程 adb 端口。
128128
- `imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii'` - 可选参数,控制 Midscene 何时调用 [yadb](https://github.com/ysbing/YADB) 来输入文本。默认值为 'always-yadb'。
129129
- `displayId?: number` - 可选参数,用于指定要使用的显示器 ID。默认值为 undefined,表示使用当前显示器。
130+
- `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 `1 / devicePixelRatio`(根据设备像素比自动缩放)。
130131

131132
### Android Agent 上的更多接口
132133

apps/site/docs/zh/integrate-with-ios.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ IOSDevice 的构造函数支持以下参数:
199199
- `wdaPort?: number` - 可选参数,WebDriverAgent 端口。默认值为 8100。
200200
- `wdaHost?: string` - 可选参数,WebDriverAgent 主机。默认值为 'localhost'。
201201
- `autoDismissKeyboard?: boolean` - 可选参数,是否在输入文本后自动关闭键盘。默认值为 true。
202+
- `screenshotResizeScale?: number` - 可选参数,截图缩放比例。例如 0.5 表示将截图缩小到原来的 50%,可以减少 AI 处理的图片大小,提升响应速度。默认值为 1(不缩放)。
202203
- `customActions?: DeviceAction<any>[]` - 可选参数,自定义设备动作列表。
203204

204205
### iOS Agent 上的更多接口

packages/android/src/agent.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ export async function agentFromAdbDevice(
4444
usePhysicalDisplayIdForScreenshot: opts?.usePhysicalDisplayIdForScreenshot,
4545
usePhysicalDisplayIdForDisplayLookup:
4646
opts?.usePhysicalDisplayIdForDisplayLookup,
47+
screenshotResizeScale: opts?.screenshotResizeScale,
4748
});
4849

4950
await device.connect();

packages/ios/src/agent.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ export async function agentFromWebDriverAgent(
3131
wdaPort: opts?.wdaPort,
3232
wdaHost: opts?.wdaHost,
3333
useWDA: opts?.useWDA,
34+
screenshotResizeScale: opts?.screenshotResizeScale,
3435
});
3536

3637
await device.connect();

packages/ios/src/device.ts

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,14 @@ export type IOSDeviceOpt = {
3737
wdaPort?: number;
3838
wdaHost?: string;
3939
useWDA?: boolean;
40+
screenshotResizeScale?: number;
4041
} & IOSDeviceInputOpt;
4142

4243
export class IOSDevice implements AbstractInterface {
4344
private deviceId: string;
4445
private devicePixelRatio = 1;
4546
private devicePixelRatioInitialized = false;
47+
private scalingRatio = 1; // Record scaling ratio for coordinate adjustment
4648
private destroyed = false;
4749
private description: string | undefined;
4850
private customActions?: DeviceAction<any>[];
@@ -341,12 +343,35 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
341343
}
342344

343345
async size(): Promise<Size> {
346+
// Ensure device pixel ratio is initialized first
347+
await this.initializeDevicePixelRatio();
348+
344349
const screenSize = await this.getScreenSize();
345350

351+
// Determine scaling: use screenshotResizeScale if provided, otherwise fall back to 1 (no scaling)
352+
// Default is 1
353+
const scale = this.options?.screenshotResizeScale ?? 1;
354+
this.scalingRatio = scale;
355+
356+
// Apply scale to get logical dimensions for AI processing
357+
const logicalWidth = Math.round(screenSize.width * scale);
358+
const logicalHeight = Math.round(screenSize.height * scale);
359+
360+
debugDevice(
361+
`size() - screenSize: ${screenSize.width}x${screenSize.height}, scale: ${scale}, logicalSize: ${logicalWidth}x${logicalHeight}`,
362+
);
363+
346364
return {
347-
width: screenSize.width,
348-
height: screenSize.height,
349-
dpr: screenSize.scale,
365+
width: logicalWidth,
366+
height: logicalHeight,
367+
};
368+
}
369+
370+
private adjustCoordinates(x: number, y: number): { x: number; y: number } {
371+
const scale = this.scalingRatio;
372+
return {
373+
x: Math.round(x / scale),
374+
y: Math.round(y / scale),
350375
};
351376
}
352377

@@ -399,7 +424,11 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
399424

400425
// Core interaction methods
401426
async tap(x: number, y: number): Promise<void> {
402-
await this.wdaBackend.tap(x, y);
427+
const adjusted = this.adjustCoordinates(x, y);
428+
debugDevice(
429+
`tap at coordinates - input: (${x}, ${y}), adjusted: (${adjusted.x}, ${adjusted.y}), scale: ${this.scalingRatio}`,
430+
);
431+
await this.wdaBackend.tap(adjusted.x, adjusted.y);
403432
}
404433

405434
// Android-compatible method name
@@ -409,11 +438,13 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
409438
}
410439

411440
async doubleTap(x: number, y: number): Promise<void> {
412-
await this.wdaBackend.doubleTap(x, y);
441+
const adjusted = this.adjustCoordinates(x, y);
442+
await this.wdaBackend.doubleTap(adjusted.x, adjusted.y);
413443
}
414444

415445
async longPress(x: number, y: number, duration = 1000): Promise<void> {
416-
await this.wdaBackend.longPress(x, y, duration);
446+
const adjusted = this.adjustCoordinates(x, y);
447+
await this.wdaBackend.longPress(adjusted.x, adjusted.y, duration);
417448
}
418449

419450
async swipe(
@@ -423,7 +454,15 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale})
423454
toY: number,
424455
duration = 500,
425456
): Promise<void> {
426-
await this.wdaBackend.swipe(fromX, fromY, toX, toY, duration);
457+
const adjustedFrom = this.adjustCoordinates(fromX, fromY);
458+
const adjustedTo = this.adjustCoordinates(toX, toY);
459+
await this.wdaBackend.swipe(
460+
adjustedFrom.x,
461+
adjustedFrom.y,
462+
adjustedTo.x,
463+
adjustedTo.y,
464+
duration,
465+
);
427466
}
428467

429468
async typeText(text: string, options?: IOSDeviceInputOpt): Promise<void> {

0 commit comments

Comments
 (0)