Skip to content

Commit a201d73

Browse files
committed
feat(core): add screenshot scaling support for improved AI processing
1 parent cbe21cb commit a201d73

File tree

6 files changed

+60
-68
lines changed

6 files changed

+60
-68
lines changed

packages/android/src/device.ts

Lines changed: 10 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -713,28 +713,15 @@ ${Object.keys(size)
713713
const width = Number.parseInt(match[isLandscape ? 2 : 1], 10);
714714
const height = Number.parseInt(match[isLandscape ? 1 : 2], 10);
715715

716-
// Use cached device pixel ratio instead of calling getDisplayDensity() every time
717-
718-
// Convert physical pixels to logical pixels for consistent coordinate system
719-
// adjustCoordinates() will convert back to physical pixels when needed for touch operations
720-
const logicalWidth = Math.round(width / this.devicePixelRatio);
721-
const logicalHeight = Math.round(height / this.devicePixelRatio);
722-
716+
// Return physical pixels to match screenshot dimensions
717+
// This ensures AI coordinate conversion uses the same dimensions as the screenshot
723718
return {
724-
width: logicalWidth,
725-
height: logicalHeight,
719+
width,
720+
height,
726721
dpr: this.devicePixelRatio,
727722
};
728723
}
729724

730-
private adjustCoordinates(x: number, y: number): { x: number; y: number } {
731-
const ratio = this.devicePixelRatio;
732-
return {
733-
x: Math.round(x * ratio),
734-
y: Math.round(y * ratio),
735-
};
736-
}
737-
738725
/**
739726
* Calculate the end point for scroll operations based on start point, scroll delta, and screen boundaries.
740727
* This method ensures that scroll operations stay within screen bounds and maintain a minimum scroll distance
@@ -1183,20 +1170,17 @@ ${Object.keys(size)
11831170
async mouseClick(x: number, y: number): Promise<void> {
11841171
const adb = await this.getAdb();
11851172

1186-
// Use adjusted coordinates
1187-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
11881173
await adb.shell(
1189-
`input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} 150`,
1174+
`input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} 150`,
11901175
);
11911176
}
11921177

11931178
async mouseDoubleClick(x: number, y: number): Promise<void> {
11941179
const adb = await this.getAdb();
1195-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
11961180

11971181
// Use input tap for double-click as it generates proper touch events
11981182
// that Android can recognize as a double-click gesture
1199-
const tapCommand = `input${this.getDisplayArg()} tap ${adjustedX} ${adjustedY}`;
1183+
const tapCommand = `input${this.getDisplayArg()} tap ${x} ${y}`;
12001184
await adb.shell(tapCommand);
12011185
// Short delay between taps for double-click recognition
12021186
await sleep(50);
@@ -1216,15 +1200,11 @@ ${Object.keys(size)
12161200
): Promise<void> {
12171201
const adb = await this.getAdb();
12181202

1219-
// Use adjusted coordinates
1220-
const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y);
1221-
const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y);
1222-
12231203
// Ensure duration has a default value
12241204
const swipeDuration = duration ?? defaultNormalScrollDuration;
12251205

12261206
await adb.shell(
1227-
`input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${swipeDuration}`,
1207+
`input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${swipeDuration}`,
12281208
);
12291209
}
12301210

@@ -1264,22 +1244,12 @@ ${Object.keys(size)
12641244
const endX = startX - deltaX;
12651245
const endY = startY - deltaY;
12661246

1267-
// Adjust coordinates to fit device ratio
1268-
const { x: adjustedStartX, y: adjustedStartY } = this.adjustCoordinates(
1269-
startX,
1270-
startY,
1271-
);
1272-
const { x: adjustedEndX, y: adjustedEndY } = this.adjustCoordinates(
1273-
endX,
1274-
endY,
1275-
);
1276-
12771247
const adb = await this.getAdb();
12781248
const swipeDuration = duration ?? defaultNormalScrollDuration;
12791249

12801250
// Execute the swipe operation
12811251
await adb.shell(
1282-
`input${this.getDisplayArg()} swipe ${adjustedStartX} ${adjustedStartY} ${adjustedEndX} ${adjustedEndY} ${swipeDuration}`,
1252+
`input${this.getDisplayArg()} swipe ${startX} ${startY} ${endX} ${endY} ${swipeDuration}`,
12831253
);
12841254
}
12851255

@@ -1320,10 +1290,8 @@ ${Object.keys(size)
13201290
async longPress(x: number, y: number, duration = 1000): Promise<void> {
13211291
const adb = await this.getAdb();
13221292

1323-
// Use adjusted coordinates
1324-
const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
13251293
await adb.shell(
1326-
`input${this.getDisplayArg()} swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} ${duration}`,
1294+
`input${this.getDisplayArg()} swipe ${x} ${y} ${x} ${y} ${duration}`,
13271295
);
13281296
}
13291297

@@ -1355,13 +1323,9 @@ ${Object.keys(size)
13551323
): Promise<void> {
13561324
const adb = await this.getAdb();
13571325

1358-
// Use adjusted coordinates
1359-
const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y);
1360-
const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y);
1361-
13621326
// Use the specified duration for better pull gesture recognition
13631327
await adb.shell(
1364-
`input${this.getDisplayArg()} swipe ${fromX} ${fromY} ${toX} ${toY} ${duration}`,
1328+
`input${this.getDisplayArg()} swipe ${from.x} ${from.y} ${to.x} ${to.y} ${duration}`,
13651329
);
13661330
}
13671331

packages/core/src/agent/agent.ts

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import {
4848
globalConfigManager,
4949
globalModelConfigManager,
5050
} from '@midscene/shared/env';
51+
import { resizeImgBase64 } from '@midscene/shared/img';
5152
import { getDebug } from '@midscene/shared/logger';
5253
import { assert } from '@midscene/shared/utils';
5354
// import type { AndroidDeviceInputOpt } from '../device';
@@ -123,6 +124,11 @@ export class Agent<
123124
*/
124125
private hasWarnedNonVLModel = false;
125126

127+
/**
128+
* Screenshot scale factor for AI model processing
129+
*/
130+
private screenshotScale?: number;
131+
126132
// @deprecated use .interface instead
127133
get page() {
128134
return this.interface;
@@ -165,6 +171,7 @@ export class Agent<
165171
? new ModelConfigManager(opts.modelConfig)
166172
: globalModelConfigManager;
167173

174+
this.screenshotScale = opts?.screenshotScale;
168175
this.onTaskStartTip = this.opts.onTaskStartTip;
169176

170177
this.insight = new Insight(async (action: InsightAction) => {
@@ -207,15 +214,48 @@ export class Agent<
207214
return this.frozenUIContext;
208215
}
209216

217+
// Get original context
218+
let context: UIContext;
210219
if (this.interface.getContext) {
211220
debug('Using page.getContext for action:', action);
212-
return await this.interface.getContext();
221+
context = await this.interface.getContext();
213222
} else {
214223
debug('Using commonContextParser for action:', action);
215-
return await commonContextParser(this.interface, {
224+
context = await commonContextParser(this.interface, {
216225
uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(),
217226
});
218227
}
228+
229+
// Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR
230+
let targetWidth = context.size.width;
231+
let targetHeight = context.size.height;
232+
let needResize = false;
233+
234+
if (this.screenshotScale && this.screenshotScale !== 1) {
235+
// User-specified scaling ratio
236+
debug(`Applying user screenshot scale: ${this.screenshotScale}`);
237+
targetWidth = Math.round(context.size.width * this.screenshotScale);
238+
targetHeight = Math.round(context.size.height * this.screenshotScale);
239+
needResize = true;
240+
} else if (context.size.dpr && context.size.dpr !== 1) {
241+
// No user-specified scaling, use DPR scaling to logical size
242+
debug(
243+
`Applying DPR scaling: ${context.size.dpr} (resize to logical size)`,
244+
);
245+
// Target is logical size, no need to change targetWidth/targetHeight
246+
needResize = true;
247+
}
248+
249+
// Execute scaling
250+
if (needResize) {
251+
debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`);
252+
context.screenshotBase64 = await resizeImgBase64(
253+
context.screenshotBase64,
254+
{ width: targetWidth, height: targetHeight },
255+
);
256+
}
257+
258+
return context;
219259
}
220260

221261
async _snapshotContext(): Promise<UIContext> {
@@ -819,9 +859,9 @@ export class Agent<
819859
return {
820860
rect: element?.rect,
821861
center: element?.center,
822-
scale: (await this.interface.size()).dpr,
862+
dpr: (await this.interface.size()).dpr,
823863
} as Pick<LocateResultElement, 'rect' | 'center'> & {
824-
scale: number;
864+
dpr: number;
825865
};
826866
}
827867

packages/core/src/agent/utils.ts

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,12 @@ export async function commonContextParser(
4747
});
4848
debugProfile('UploadTestInfoToServer end');
4949

50-
let screenshotBase64 = await interfaceInstance.screenshotBase64();
50+
const screenshotBase64 = await interfaceInstance.screenshotBase64();
5151
assert(screenshotBase64!, 'screenshotBase64 is required');
5252

5353
const size = await interfaceInstance.size();
5454
debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);
5555

56-
if (size.dpr && size.dpr !== 1) {
57-
debugProfile('Resizing screenshot for non-1 dpr');
58-
screenshotBase64 = await resizeImgBase64(screenshotBase64, {
59-
width: size.width,
60-
height: size.height,
61-
});
62-
debugProfile('ResizeImgBase64 end');
63-
}
64-
6556
return {
6657
tree: {
6758
node: null,

packages/core/src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,4 +604,6 @@ export interface AgentOpt {
604604
modelConfig?: TModelConfigFn;
605605
cache?: Cache;
606606
replanningCycleLimit?: number;
607+
/* Screenshot scaling ratio to reduce image size sent to AI for better performance */
608+
screenshotScale?: number;
607609
}

packages/shared/src/img/info.ts

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,9 @@
11
import assert from 'node:assert';
22
import { Buffer } from 'node:buffer';
33
import type Jimp from 'jimp';
4+
import type { Size } from '../types';
45
import getJimp from './get-jimp';
56

6-
export interface Size {
7-
width: number;
8-
height: number;
9-
dpr?: number;
10-
}
11-
127
export interface ImageInfo extends Size {
138
jimpImage: Jimp;
149
}

packages/shared/src/types/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ export interface Point {
77
}
88

99
export interface Size {
10-
width: number; // device independent window size
10+
width: number; // logical pixel size
1111
height: number;
12-
dpr?: number; // the scale factor of the screenshots
12+
dpr?: number; // dpr is the ratio of physical pixels to logical pixels. For example, dpr is 2 when the returned screenshotBase64 is 2000x1000 and the logical width and height here are 1000x500. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model.
1313
}
1414

1515
export type Rect = Point & Size & { zoom?: number };

0 commit comments

Comments
 (0)