Skip to content

Commit 397fa2b

Browse files
committed
feat(core): add screenshot scaling support for improved AI processing
1 parent f99c220 commit 397fa2b

File tree

6 files changed

+60
-68
lines changed

6 files changed

+60
-68
lines changed

packages/android/src/device.ts

Lines changed: 10 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -713,28 +713,15 @@ ${Object.keys(size)
713713
const width = Number.parseInt(match[isLandscape ? 2 : 1], 10);
714714
const height = Number.parseInt(match[isLandscape ? 1 : 2], 10);
715715

716-
// Use cached device pixel ratio instead of calling getDisplayDensity() every time
717-
718-
// Convert physical pixels to logical pixels for consistent coordinate system
719-
// adjustCoordinates() will convert back to physical pixels when needed for touch operations
720-
const logicalWidth = Math.round(width / this.devicePixelRatio);
721-
const logicalHeight = Math.round(height / this.devicePixelRatio);
722-
716+
// Return physical pixels to match screenshot dimensions
717+
// This ensures AI coordinate conversion uses the same dimensions as the screenshot
723718
return {
724-
width: logicalWidth,
725-
height: logicalHeight,
719+
width,
720+
height,
726721
dpr: this.devicePixelRatio,
727722
};
728723
}
729724

730-
private adjustCoordinates(x: number, y: number): { x: number; y: number } {
731-
const ratio = this.devicePixelRatio;
732-
return {
733-
x: Math.round(x * ratio),
734-
y: Math.round(y * ratio),
735-
};
736-
}
737-
738725
/**
739726
* Calculate the end point for scroll operations based on start point, scroll delta, and screen boundaries.
740727
* This method ensures that scroll operations stay within screen bounds and maintain a minimum scroll distance
@@ -1183,20 +1170,17 @@ ${Object.keys(size)
11831170
async mouseClick(x: number, y: number): Promise<void> {
11841171
const adb = await this.getAdb();
11851172

1186-
// Use adjusted coordinates
1187-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
11881173
await adb.shell(
1189-
`input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} 150`,
1174+
`input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} 150`,
11901175
);
11911176
}
11921177

11931178
async mouseDoubleClick(x: number, y: number): Promise<void> {
11941179
const adb = await this.getAdb();
1195-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
11961180

11971181
// Use input tap for double-click as it generates proper touch events
11981182
// that Android can recognize as a double-click gesture
1199-
const tapCommand = `input${this.getDisplayArg()} tap ${adjustedX} ${adjustedY}`;
1183+
const tapCommand = `input${this.getDisplayArg()} tap ${x} ${y}`;
12001184
await adb.shell(tapCommand);
12011185
// Short delay between taps for double-click recognition
12021186
await sleep(50);
@@ -1216,15 +1200,11 @@ ${Object.keys(size)
12161200
): Promise<void> {
12171201
const adb = await this.getAdb();
12181202

1219-
// Use adjusted coordinates
1220-
const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y);
1221-
const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y);
1222-
12231203
// Ensure duration has a default value
12241204
const swipeDuration = duration ?? defaultNormalScrollDuration;
12251205

12261206
await adb.shell(
1227-
`input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${swipeDuration}`,
1207+
`input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${swipeDuration}`,
12281208
);
12291209
}
12301210

@@ -1264,22 +1244,12 @@ ${Object.keys(size)
12641244
const endX = startX - deltaX;
12651245
const endY = startY - deltaY;
12661246

1267-
// Adjust coordinates to fit device ratio
1268-
const { x: adjustedStartX, y: adjustedStartY } = this.adjustCoordinates(
1269-
startX,
1270-
startY,
1271-
);
1272-
const { x: adjustedEndX, y: adjustedEndY } = this.adjustCoordinates(
1273-
endX,
1274-
endY,
1275-
);
1276-
12771247
const adb = await this.getAdb();
12781248
const swipeDuration = duration ?? defaultNormalScrollDuration;
12791249

12801250
// Execute the swipe operation
12811251
await adb.shell(
1282-
`input${this.getDisplayArg()} swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${swipeDuration}`,
1252+
`input${this.getDisplayArg()} swipe ${startX} ${startY} ${endX} ${endY} ${swipeDuration}`,
12831253
);
12841254
}
12851255

@@ -1320,10 +1290,8 @@ ${Object.keys(size)
13201290
async longPress(x: number, y: number, duration = 1000): Promise<void> {
13211291
const adb = await this.getAdb();
13221292

1323-
// Use adjusted coordinates
1324-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
13251293
await adb.shell(
1326-
`input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} ${duration}`,
1294+
`input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} ${duration}`,
13271295
);
13281296
}
13291297

@@ -1355,13 +1323,9 @@ ${Object.keys(size)
13551323
): Promise<void> {
13561324
const adb = await this.getAdb();
13571325

1358-
// Use adjusted coordinates
1359-
const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y);
1360-
const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y);
1361-
13621326
// Use the specified duration for better pull gesture recognition
13631327
await adb.shell(
1364-
`input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${duration}`,
1328+
`input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${duration}`,
13651329
);
13661330
}
13671331

packages/core/src/agent/agent.ts

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import {
4848
globalConfigManager,
4949
globalModelConfigManager,
5050
} from '@midscene/shared/env';
51+
import { resizeImgBase64 } from '@midscene/shared/img';
5152
import { getDebug } from '@midscene/shared/logger';
5253
import { assert } from '@midscene/shared/utils';
5354
// import type { AndroidDeviceInputOpt } from '../device';
@@ -134,6 +135,11 @@ export class Agent<
134135
*/
135136
private hasWarnedNonVLModel = false;
136137

138+
/**
139+
* Screenshot scale factor for AI model processing
140+
*/
141+
private screenshotScale?: number;
142+
137143
// @deprecated use .interface instead
138144
get page() {
139145
return this.interface;
@@ -176,6 +182,7 @@ export class Agent<
176182
? new ModelConfigManager(opts.modelConfig)
177183
: globalModelConfigManager;
178184

185+
this.screenshotScale = opts?.screenshotScale;
179186
this.onTaskStartTip = this.opts.onTaskStartTip;
180187

181188
this.insight = new Insight(async (action: InsightAction) => {
@@ -218,15 +225,48 @@ export class Agent<
218225
return this.frozenUIContext;
219226
}
220227

228+
// Get original context
229+
let context: UIContext;
221230
if (this.interface.getContext) {
222231
debug('Using page.getContext for action:', action);
223-
return await this.interface.getContext();
232+
context = await this.interface.getContext();
224233
} else {
225234
debug('Using commonContextParser for action:', action);
226-
return await commonContextParser(this.interface, {
235+
context = await commonContextParser(this.interface, {
227236
uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(),
228237
});
229238
}
239+
240+
// Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR
241+
let targetWidth = context.size.width;
242+
let targetHeight = context.size.height;
243+
let needResize = false;
244+
245+
if (this.screenshotScale && this.screenshotScale !== 1) {
246+
// User-specified scaling ratio
247+
debug(`Applying user screenshot scale: ${this.screenshotScale}`);
248+
targetWidth = Math.round(context.size.width * this.screenshotScale);
249+
targetHeight = Math.round(context.size.height * this.screenshotScale);
250+
needResize = true;
251+
} else if (context.size.dpr && context.size.dpr !== 1) {
252+
// No user-specified scaling, use DPR scaling to logical size
253+
debug(
254+
`Applying DPR scaling: ${context.size.dpr} (resize to logical size)`,
255+
);
256+
// Target is logical size, no need to change targetWidth/targetHeight
257+
needResize = true;
258+
}
259+
260+
// Execute scaling
261+
if (needResize) {
262+
debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`);
263+
context.screenshotBase64 = await resizeImgBase64(
264+
context.screenshotBase64,
265+
{ width: targetWidth, height: targetHeight },
266+
);
267+
}
268+
269+
return context;
230270
}
231271

232272
async _snapshotContext(): Promise<UIContext> {
@@ -830,9 +870,9 @@ export class Agent<
830870
return {
831871
rect: element?.rect,
832872
center: element?.center,
833-
scale: (await this.interface.size()).dpr,
873+
dpr: (await this.interface.size()).dpr,
834874
} as Pick<LocateResultElement, 'rect' | 'center'> & {
835-
scale: number;
875+
dpr: number;
836876
};
837877
}
838878

packages/core/src/agent/utils.ts

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,12 @@ export async function commonContextParser(
5050
});
5151
debugProfile('UploadTestInfoToServer end');
5252

53-
let screenshotBase64 = await interfaceInstance.screenshotBase64();
53+
const screenshotBase64 = await interfaceInstance.screenshotBase64();
5454
assert(screenshotBase64!, 'screenshotBase64 is required');
5555

5656
const size = await interfaceInstance.size();
5757
debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);
5858

59-
if (size.dpr && size.dpr !== 1) {
60-
debugProfile('Resizing screenshot for non-1 dpr');
61-
screenshotBase64 = await resizeImgBase64(screenshotBase64, {
62-
width: size.width,
63-
height: size.height,
64-
});
65-
debugProfile('ResizeImgBase64 end');
66-
}
67-
6859
return {
6960
tree: {
7061
node: null,

packages/core/src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,4 +604,6 @@ export interface AgentOpt {
604604
modelConfig?: TModelConfigFn;
605605
cache?: Cache;
606606
replanningCycleLimit?: number;
607+
/* Screenshot scaling ratio applied to the screenshot width and height (e.g. 0.5 halves both) to reduce the image size sent to the AI for better performance */
608+
screenshotScale?: number;
607609
}

packages/shared/src/img/info.ts

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
import assert from 'node:assert';
22
import { Buffer } from 'node:buffer';
33
import type Jimp from 'jimp';
4+
import type { Size } from '../types';
45
import getJimp from './get-jimp';
56

6-
export interface Size {
7-
width: number;
8-
height: number;
9-
dpr?: number;
10-
}
11-
127
export interface ImageInfo extends Size {
138
jimpImage: Jimp;
149
}

packages/shared/src/types/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ export interface Point {
77
}
88

99
export interface Size {
10-
width: number; // device independent window size
10+
width: number; // logical pixel size
1111
height: number;
12-
dpr?: number; // the scale factor of the screenshots
12+
dpr?: number; // dpr is the ratio of physical pixels to logical pixels. For example, the dpr is 2 when the returned screenshotBase64 is 2000x1000 and the logical width and height here are 1000x500. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model.
1313
}
1414

1515
export type Rect = Point & Size & { zoom?: number };

0 commit comments

Comments
 (0)