Skip to content

Commit 275e0d1

Browse files
yuyutaotaoquanru
authored and committed
feat(core): calculate dpr in agent
1 parent 397fa2b commit 275e0d1

File tree

2 files changed

+73
-28
lines changed

2 files changed

+73
-28
lines changed

packages/core/src/agent/agent.ts

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ import {
4848
globalConfigManager,
4949
globalModelConfigManager,
5050
} from '@midscene/shared/env';
51-
import { resizeImgBase64 } from '@midscene/shared/img';
51+
import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img';
5252
import { getDebug } from '@midscene/shared/logger';
5353
import { assert } from '@midscene/shared/utils';
5454
// import type { AndroidDeviceInputOpt } from '../device';
@@ -136,10 +136,15 @@ export class Agent<
136136
private hasWarnedNonVLModel = false;
137137

138138
/**
139-
* Screenshot scale factor for AI model processing
139+
* Screenshot scale factor derived from actual screenshot dimensions
140140
*/
141141
private screenshotScale?: number;
142142

143+
/**
144+
* Internal promise to deduplicate screenshot scale computation
145+
*/
146+
private screenshotScalePromise?: Promise<number>;
147+
143148
// @deprecated use .interface instead
144149
get page() {
145150
return this.interface;
@@ -161,6 +166,52 @@ export class Agent<
161166
}
162167
}
163168

169+
/**
170+
* Lazily compute the ratio between the physical screenshot width and the logical page width
* (screenshot width in pixels divided by `context.size.width`).
*
* The first successfully computed value is stored in `this.screenshotScale` and returned
* for every later call, regardless of the `context` passed in. While a computation is in
* flight it is kept in `this.screenshotScalePromise` so that overlapping callers await the
* same work instead of decoding the screenshot again.
*
* @param context - UI context providing `size.width` and `screenshotBase64`.
* @returns The screenshot-to-page width ratio.
* NOTE(review): the `assert` helper presumably throws on failure, which would reject the
* returned promise for an invalid page width, screenshot width or ratio — confirm.
171+
*/
172+
private async getScreenshotScale(context: UIContext): Promise<number> {
173+
// Fast path: a previously computed scale is reused as-is.
if (this.screenshotScale !== undefined) {
174+
return this.screenshotScale;
175+
}
176+
177+
// Start a computation only if none is currently pending; otherwise share the pending one.
if (!this.screenshotScalePromise) {
178+
this.screenshotScalePromise = (async () => {
179+
// Logical width reported by the context; must be a positive number.
const pageWidth = context.size?.width;
180+
assert(
181+
pageWidth && pageWidth > 0,
182+
`Invalid page width when computing screenshot scale: ${pageWidth}`,
183+
);
184+
185+
// Width of the screenshot image itself, read from the base64 payload.
const { width: screenshotWidth } = await imageInfoOfBase64(
186+
context.screenshotBase64,
187+
);
188+
189+
assert(
190+
Number.isFinite(screenshotWidth) && screenshotWidth > 0,
191+
`Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`,
192+
);
193+
194+
// Scale = screenshot pixels per logical page pixel, derived from widths only.
const computedScale = screenshotWidth / pageWidth;
195+
assert(
196+
Number.isFinite(computedScale) && computedScale > 0,
197+
`Invalid computed screenshot scale: ${computedScale}`,
198+
);
199+
200+
debug(
201+
`Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and page width ${pageWidth}`,
202+
);
203+
return computedScale;
204+
})();
205+
}
206+
207+
try {
208+
// Cache the result so later calls take the fast path above.
this.screenshotScale = await this.screenshotScalePromise;
209+
return this.screenshotScale;
210+
} finally {
211+
// Clear the pending promise on success and on failure; after a failure the cached
// scale is still undefined, so the next call starts a fresh computation.
this.screenshotScalePromise = undefined;
212+
}
213+
}
214+
164215
constructor(interfaceInstance: InterfaceType, opts?: AgentOpt) {
165216
this.interface = interfaceInstance;
166217
this.opts = Object.assign(
@@ -182,7 +233,6 @@ export class Agent<
182233
? new ModelConfigManager(opts.modelConfig)
183234
: globalModelConfigManager;
184235

185-
this.screenshotScale = opts?.screenshotScale;
186236
this.onTaskStartTip = this.opts.onTaskStartTip;
187237

188238
this.insight = new Insight(async (action: InsightAction) => {
@@ -237,33 +287,22 @@ export class Agent<
237287
});
238288
}
239289

240-
// Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR
241-
let targetWidth = context.size.width;
242-
let targetHeight = context.size.height;
243-
let needResize = false;
244-
245-
if (this.screenshotScale && this.screenshotScale !== 1) {
246-
// User-specified scaling ratio
247-
debug(`Applying user screenshot scale: ${this.screenshotScale}`);
248-
targetWidth = Math.round(context.size.width * this.screenshotScale);
249-
targetHeight = Math.round(context.size.height * this.screenshotScale);
250-
needResize = true;
251-
} else if (context.size.dpr && context.size.dpr !== 1) {
252-
// No user-specified scaling, use DPR scaling to logical size
290+
const computedScreenshotScale = await this.getScreenshotScale(context);
291+
292+
if (computedScreenshotScale !== 1) {
293+
const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4));
253294
debug(
254-
`Applying DPR scaling: ${context.size.dpr} (resize to logical size)`,
295+
`Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`,
255296
);
256-
// Target is logical size, no need to change targetWidth/targetHeight
257-
needResize = true;
258-
}
259-
260-
// Execute scaling
261-
if (needResize) {
297+
const targetWidth = Math.round(context.size.width);
298+
const targetHeight = Math.round(context.size.height);
262299
debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`);
263300
context.screenshotBase64 = await resizeImgBase64(
264301
context.screenshotBase64,
265302
{ width: targetWidth, height: targetHeight },
266303
);
304+
} else {
305+
debug(`screenshot scale=${computedScreenshotScale}`);
267306
}
268307

269308
return context;
@@ -867,12 +906,18 @@ export class Agent<
867906

868907
const { element } = output;
869908

909+
// `size()` is async: await the call first, then read `dpr` from the resolved size.
// Reading `.dpr` off the pending Promise always yields `undefined`, which silently
// dropped the field from the result.
const dprValue = (await this.interface.size()).dpr;
910+
// Only include the deprecated `dpr` key when the interface actually reports a value.
const dprEntry = dprValue
911+
? {
912+
dpr: dprValue,
913+
}
914+
: {};
870915
return {
871916
rect: element?.rect,
872917
center: element?.center,
873-
dpr: (await this.interface.size()).dpr,
918+
...dprEntry,
874919
} as Pick<LocateResultElement, 'rect' | 'center'> & {
875-
dpr: number;
920+
dpr?: number; // this field is deprecated
876921
};
877922
}
878923

packages/shared/src/types/index.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ export interface Point {
77
}
88

99
export interface Size {
10-
width: number; // logical pixel size
11-
height: number;
12-
dpr?: number; // dpr is the ratio of the physical pixel to the logical pixel. For example, the dpr is 2 when the screenshotBase64 returned is 2000x1000 when the logical width and height are 1000x500 here. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model.
10+
width: number; // The image sent to the AI model is resized to this width. Usually this should be the logical pixel width.
11+
height: number; // The image sent to the AI model is resized to this height. Usually this should be the logical pixel height.
12+
dpr?: number; // Deprecated: do NOT use it.
1313
}
1414

1515
export type Rect = Point & Size & { zoom?: number };

0 commit comments

Comments
 (0)