Skip to content

Commit 7337034

Browse files
committed
feat(core): calculate dpr in agent
1 parent a201d73 commit 7337034

File tree

2 files changed

+73
-28
lines changed

2 files changed

+73
-28
lines changed

packages/core/src/agent/agent.ts

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ import {
4848
globalConfigManager,
4949
globalModelConfigManager,
5050
} from '@midscene/shared/env';
51-
import { resizeImgBase64 } from '@midscene/shared/img';
51+
import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img';
5252
import { getDebug } from '@midscene/shared/logger';
5353
import { assert } from '@midscene/shared/utils';
5454
// import type { AndroidDeviceInputOpt } from '../device';
@@ -125,10 +125,15 @@ export class Agent<
125125
private hasWarnedNonVLModel = false;
126126

127127
/**
128-
* Screenshot scale factor for AI model processing
128+
* Screenshot scale factor derived from actual screenshot dimensions
129129
*/
130130
private screenshotScale?: number;
131131

132+
/**
133+
* Internal promise to deduplicate screenshot scale computation
134+
*/
135+
private screenshotScalePromise?: Promise<number>;
136+
132137
// @deprecated use .interface instead
133138
get page() {
134139
return this.interface;
@@ -150,6 +155,52 @@ export class Agent<
150155
}
151156
}
152157

158+
/**
159+
* Lazily compute the ratio between the physical screenshot width and the logical page width
* (the screenshot's pixel width divided by `context.size.width`).
*
* The first successful result is stored in `this.screenshotScale` and returned by every
* later call without reading `context` again. While a computation is in flight, concurrent
* callers await the same `this.screenshotScalePromise` instead of starting a second one.
*
* @param context - UI context supplying `size.width` and `screenshotBase64`.
* @returns The cached or newly computed screenshot scale.
* @throws NOTE(review): presumably whatever `assert` (imported from `@midscene/shared/utils`)
* throws when the page width, the screenshot width, or the resulting ratio is missing,
* non-finite, or not positive — its implementation is not visible here, confirm.
160+
*/
161+
private async getScreenshotScale(context: UIContext): Promise<number> {
162+
if (this.screenshotScale !== undefined) { // already computed once: reuse the cached value
163+
return this.screenshotScale;
164+
}
165+
166+
if (!this.screenshotScalePromise) { // no computation in flight yet: start one
167+
this.screenshotScalePromise = (async () => {
168+
const pageWidth = context.size?.width; // logical width; undefined when `size` is absent
169+
assert(
170+
pageWidth && pageWidth > 0, // falsy for undefined, 0 and NaN; also rejects negatives
171+
`Invalid page width when computing screenshot scale: ${pageWidth}`,
172+
);
173+
174+
const { width: screenshotWidth } = await imageInfoOfBase64( // width reported for the screenshot image
175+
context.screenshotBase64,
176+
);
177+
178+
assert(
179+
Number.isFinite(screenshotWidth) && screenshotWidth > 0,
180+
`Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`,
181+
);
182+
183+
const computedScale = screenshotWidth / pageWidth; // e.g. 2 when a 2000-wide screenshot maps to a 1000-wide page
184+
assert(
185+
Number.isFinite(computedScale) && computedScale > 0,
186+
`Invalid computed screenshot scale: ${computedScale}`,
187+
);
188+
189+
debug(
190+
`Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and page width ${pageWidth}`,
191+
);
192+
return computedScale;
193+
})();
194+
}
195+
196+
try {
197+
this.screenshotScale = await this.screenshotScalePromise; // cached only on success; a rejection propagates to the caller
198+
return this.screenshotScale;
199+
} finally {
200+
this.screenshotScalePromise = undefined; // always drop the in-flight promise so a failed attempt can be retried
201+
}
202+
}
203+
153204
constructor(interfaceInstance: InterfaceType, opts?: AgentOpt) {
154205
this.interface = interfaceInstance;
155206
this.opts = Object.assign(
@@ -171,7 +222,6 @@ export class Agent<
171222
? new ModelConfigManager(opts.modelConfig)
172223
: globalModelConfigManager;
173224

174-
this.screenshotScale = opts?.screenshotScale;
175225
this.onTaskStartTip = this.opts.onTaskStartTip;
176226

177227
this.insight = new Insight(async (action: InsightAction) => {
@@ -226,33 +276,22 @@ export class Agent<
226276
});
227277
}
228278

229-
// Unified screenshot scaling: prioritize screenshotScale, otherwise use DPR
230-
let targetWidth = context.size.width;
231-
let targetHeight = context.size.height;
232-
let needResize = false;
233-
234-
if (this.screenshotScale && this.screenshotScale !== 1) {
235-
// User-specified scaling ratio
236-
debug(`Applying user screenshot scale: ${this.screenshotScale}`);
237-
targetWidth = Math.round(context.size.width * this.screenshotScale);
238-
targetHeight = Math.round(context.size.height * this.screenshotScale);
239-
needResize = true;
240-
} else if (context.size.dpr && context.size.dpr !== 1) {
241-
// No user-specified scaling, use DPR scaling to logical size
279+
const computedScreenshotScale = await this.getScreenshotScale(context);
280+
281+
if (computedScreenshotScale !== 1) {
282+
const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4));
242283
debug(
243-
`Applying DPR scaling: ${context.size.dpr} (resize to logical size)`,
284+
`Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`,
244285
);
245-
// Target is logical size, no need to change targetWidth/targetHeight
246-
needResize = true;
247-
}
248-
249-
// Execute scaling
250-
if (needResize) {
286+
const targetWidth = Math.round(context.size.width);
287+
const targetHeight = Math.round(context.size.height);
251288
debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`);
252289
context.screenshotBase64 = await resizeImgBase64(
253290
context.screenshotBase64,
254291
{ width: targetWidth, height: targetHeight },
255292
);
293+
} else {
294+
debug(`screenshot scale=${computedScreenshotScale}`);
256295
}
257296

258297
return context;
@@ -856,12 +895,18 @@ export class Agent<
856895

857896
const { element } = output;
858897

898+
const dprValue = await (this.interface.size() as any).dpr;
899+
const dprEntry = dprValue
900+
? {
901+
dpr: dprValue,
902+
}
903+
: {};
859904
return {
860905
rect: element?.rect,
861906
center: element?.center,
862-
dpr: (await this.interface.size()).dpr,
907+
...dprEntry,
863908
} as Pick<LocateResultElement, 'rect' | 'center'> & {
864-
dpr: number;
909+
dpr?: number; // this field is deprecated
865910
};
866911
}
867912

packages/shared/src/types/index.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ export interface Point {
77
}
88

99
/**
 * Width/height pair. As rendered in this diff, the `-` rows are the members before the
 * commit and the `+` rows are the members after it.
 */
export interface Size {
10-
width: number; // logical pixel size
11-
height: number;
12-
dpr?: number; // dpr is the ratio of the physical pixel to the logical pixel. For example, the dpr is 2 when the screenshotBase64 returned is 2000x1000 when the logical width and height are 1000x500 here. Overriding the dpr will affect how the screenshotBase64 is resized before being sent to the AI model.
10+
width: number; // The image sent to the AI model will be resized to this width. Usually you should set it to the logical pixel width.
11+
height: number; // The image sent to the AI model will be resized to this height. Usually you should set it to the logical pixel height.
12+
dpr?: number; // @deprecated Do NOT use this field.
1313
}
1414

1515
export type Rect = Point & Size & { zoom?: number };

0 commit comments

Comments
 (0)