Skip to content

Commit fb97907

Browse files
committed
feat(android): remove screenshotResizeRatio option and related logic from AndroidDevice and tests
1 parent f2fab90 commit fb97907

File tree

8 files changed

+108
-146
lines changed

8 files changed

+108
-146
lines changed

packages/android/src/device.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ export type AndroidDeviceOpt = {
6060
usePhysicalDisplayIdForScreenshot?: boolean;
6161
usePhysicalDisplayIdForDisplayLookup?: boolean;
6262
customActions?: DeviceAction<any>[];
63-
screenshotResizeRatio?: number;
6463
} & AndroidDeviceInputOpt;
6564

6665
export class AndroidDevice implements AbstractInterface {

packages/android/tests/ai/setting.test.ts

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { sleep } from '@midscene/core/utils';
12
import { describe, it, vi } from 'vitest';
23
import { agentFromAdbDevice, getConnectedDevices } from '../../src';
34

@@ -11,25 +12,14 @@ describe(
1112
await it('Android settings page demo for scroll', async () => {
1213
const devices = await getConnectedDevices();
1314
const agent = await agentFromAdbDevice(devices[0].udid, {
15+
// scale: 0.5,
1416
aiActionContext:
1517
'If any location, permission, user agreement, etc. popup, click agree. If login page pops up, close it.',
1618
});
1719

1820
await agent.launch('com.android.settings/.Settings');
19-
await agent.aiAction('pull down to refresh');
20-
await agent.aiAction('long press chat list first chat');
21-
await agent.aiAction('click recent apps button');
22-
await agent.aiAction('click android home button');
23-
await agent.aiAction('scroll list to bottom');
24-
await agent.aiAction('open "More settings"');
25-
await agent.aiAction('scroll left until left edge');
26-
await agent.aiAction('scroll right until right edge');
27-
await agent.aiAction('scroll list to top');
28-
await agent.aiAction('scroll list to bottom');
29-
await agent.aiAction('scroll down one screen');
30-
await agent.aiAction('scroll up one screen');
31-
await agent.aiAction('scroll right one screen');
32-
await agent.aiAction('scroll left one screen');
21+
await sleep(2000);
22+
await agent.aiAction('点击蓝牙');
3323
});
3424
},
3525
360 * 1000,

packages/android/tests/unit-test/page.test.ts

Lines changed: 0 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -234,94 +234,6 @@ describe('AndroidDevice', () => {
234234
expect(result).toContain(mockBuffer.toString('base64'));
235235
expect(mockAdb.shell).toHaveBeenCalledWith(expect.stringMatching(/rm/));
236236
});
237-
238-
it('should apply custom resize ratio when screenshotResizeRatio is set', async () => {
239-
const customDevice = new AndroidDevice('test-device', {
240-
screenshotResizeRatio: 0.5,
241-
});
242-
243-
vi.spyOn(customDevice, 'size').mockResolvedValue({
244-
width: 1080,
245-
height: 1920,
246-
dpr: 2,
247-
});
248-
249-
const mockBuffer = Buffer.from('test-screenshot');
250-
mockAdb.takeScreenshot.mockResolvedValue(mockBuffer);
251-
vi.spyOn(customDevice, 'getAdb').mockResolvedValue(mockAdb);
252-
253-
// Mock createImgBase64ByFormat
254-
vi.spyOn(ImgUtils, 'createImgBase64ByFormat').mockReturnValue(
255-
`data:image/png;base64,${mockBuffer.toString('base64')}`,
256-
);
257-
258-
await customDevice.screenshotBase64();
259-
260-
// Verify that resizeAndConvertImgBuffer was called with half the original size
261-
expect(ImgUtils.resizeAndConvertImgBuffer).toHaveBeenCalledWith(
262-
'png',
263-
mockBuffer,
264-
{
265-
width: 540, // 1080 * 0.5
266-
height: 960, // 1920 * 0.5
267-
},
268-
);
269-
});
270-
271-
it('should use original size when screenshotResizeRatio is not set', async () => {
272-
const mockBuffer = Buffer.from('test-screenshot');
273-
mockAdb.takeScreenshot.mockResolvedValue(mockBuffer);
274-
275-
// Mock createImgBase64ByFormat
276-
vi.spyOn(ImgUtils, 'createImgBase64ByFormat').mockReturnValue(
277-
`data:image/png;base64,${mockBuffer.toString('base64')}`,
278-
);
279-
280-
await device.screenshotBase64();
281-
282-
// Verify that resizeAndConvertImgBuffer was called with original size
283-
expect(ImgUtils.resizeAndConvertImgBuffer).toHaveBeenCalledWith(
284-
'png',
285-
mockBuffer,
286-
{
287-
width: 1080, // original width
288-
height: 1920, // original height
289-
},
290-
);
291-
});
292-
293-
it('should use default ratio of 1.0 when screenshotResizeRatio is undefined', async () => {
294-
const customDevice = new AndroidDevice('test-device', {
295-
screenshotResizeRatio: undefined,
296-
});
297-
298-
vi.spyOn(customDevice, 'size').mockResolvedValue({
299-
width: 1080,
300-
height: 1920,
301-
dpr: 2,
302-
});
303-
304-
const mockBuffer = Buffer.from('test-screenshot');
305-
mockAdb.takeScreenshot.mockResolvedValue(mockBuffer);
306-
vi.spyOn(customDevice, 'getAdb').mockResolvedValue(mockAdb);
307-
308-
// Mock createImgBase64ByFormat
309-
vi.spyOn(ImgUtils, 'createImgBase64ByFormat').mockReturnValue(
310-
`data:image/png;base64,${mockBuffer.toString('base64')}`,
311-
);
312-
313-
await customDevice.screenshotBase64();
314-
315-
// Verify that resizeAndConvertImgBuffer was called with original size (ratio 1.0)
316-
expect(ImgUtils.resizeAndConvertImgBuffer).toHaveBeenCalledWith(
317-
'png',
318-
mockBuffer,
319-
{
320-
width: 1080, // 1080 * 1.0
321-
height: 1920, // 1920 * 1.0
322-
},
323-
);
324-
});
325237
});
326238

327239
describe('mouse', () => {

packages/core/src/agent/agent.ts

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ import {
5050
} from '@midscene/shared/env';
5151
import { getDebug } from '@midscene/shared/logger';
5252
import { assert } from '@midscene/shared/utils';
53-
// import type { AndroidDeviceInputOpt } from '../device';
5453
import { TaskCache } from './task-cache';
5554
import { TaskExecutor, locatePlanForLocate } from './tasks';
5655
import { locateParamStr, paramStr, taskTitleStr, typeStr } from './ui-utils';
@@ -59,6 +58,7 @@ import {
5958
getReportFileName,
6059
parsePrompt,
6160
printReportMsg,
61+
scaleElementCoordinates,
6262
} from './utils';
6363
import { trimContextByViewport } from './utils';
6464

@@ -113,6 +113,11 @@ export class Agent<
113113

114114
modelConfigManager: ModelConfigManager;
115115

116+
/**
117+
* Scale factor for screenshot processing
118+
*/
119+
private scale: number | undefined;
120+
116121
/**
117122
* Frozen page context for consistent AI operations
118123
*/
@@ -165,6 +170,8 @@ export class Agent<
165170
? new ModelConfigManager(opts.modelConfig)
166171
: globalModelConfigManager;
167172

173+
this.scale = opts?.scale;
174+
168175
this.onTaskStartTip = this.opts.onTaskStartTip;
169176

170177
this.insight = new Insight(async (action: InsightAction) => {
@@ -187,6 +194,7 @@ export class Agent<
187194
taskCache: this.taskCache,
188195
onTaskStart: this.callbackOnTaskStartTip.bind(this),
189196
replanningCycleLimit: this.opts.replanningCycleLimit,
197+
scale: this.scale,
190198
});
191199
this.dump = this.resetDump();
192200
this.reportFileName =
@@ -215,6 +223,7 @@ export class Agent<
215223
debug('Using commonContextParser for action:', action);
216224
return await commonContextParser(this.interface, {
217225
uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(),
226+
customScale: this.scale,
218227
});
219228
}
220229
}
@@ -817,10 +826,28 @@ export class Agent<
817826

818827
const { element } = output;
819828

829+
if (!element) {
830+
return {
831+
rect: undefined,
832+
center: undefined,
833+
scale: (await this.interface.size()).dpr,
834+
} as Pick<LocateResultElement, 'rect' | 'center'> & {
835+
scale: number;
836+
};
837+
}
838+
839+
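// Apply coordinate scaling using shared utility function. NOTE(review): the locate task in tasks.ts already returns `scaledElement`; confirm `element` here is still in screenshot space so it is not scaled twice.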
840+
const deviceSize = await this.interface.size();
841+
const scaledElement = scaleElementCoordinates(
842+
element,
843+
this.scale,
844+
deviceSize.dpr,
845+
);
846+
820847
return {
821-
rect: element?.rect,
822-
center: element?.center,
823-
scale: (await this.interface.size()).dpr,
848+
rect: scaledElement.rect,
849+
center: scaledElement.center,
850+
scale: deviceSize.dpr,
824851
} as Pick<LocateResultElement, 'rect' | 'center'> & {
825852
scale: number;
826853
};

packages/core/src/agent/tasks.ts

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ import {
5454
matchElementFromCache,
5555
matchElementFromPlan,
5656
parsePrompt,
57+
scaleElementCoordinates,
5758
} from './utils';
5859

5960
interface ExecutionResult<OutputType = any> {
@@ -90,6 +91,8 @@ export class TaskExecutor {
9091

9192
replanningCycleLimit?: number;
9293

94+
scale?: number;
95+
9396
// @deprecated use .interface instead
9497
get page() {
9598
return this.interface;
@@ -102,11 +105,13 @@ export class TaskExecutor {
102105
taskCache?: TaskCache;
103106
onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];
104107
replanningCycleLimit?: number;
108+
scale?: number;
105109
},
106110
) {
107111
this.interface = interfaceInstance;
108112
this.insight = insight;
109113
this.taskCache = opts.taskCache;
114+
this.scale = opts.scale;
110115
this.onTaskStartCallback = opts?.onTaskStart;
111116
this.replanningCycleLimit = opts.replanningCycleLimit;
112117
this.conversationHistory = new ConversationHistory();
@@ -357,6 +362,15 @@ export class TaskExecutor {
357362
throw new Error(`Element not found: ${param.prompt}`);
358363
}
359364

365+
// Apply coordinate scaling using shared utility function
366+
// At this point, element is guaranteed to be non-null due to the check above
367+
const deviceSize = await this.interface.size();
368+
const scaledElement = scaleElementCoordinates(
369+
element!,
370+
this.scale,
371+
deviceSize.dpr,
372+
);
373+
360374
let hitBy: ExecutionTaskHitBy | undefined;
361375

362376
if (userExpectedPathHitFlag) {
@@ -391,11 +405,11 @@ export class TaskExecutor {
391405
};
392406
}
393407

394-
onResult?.(element);
408+
onResult?.(scaledElement);
395409

396410
return {
397411
output: {
398-
element,
412+
element: scaledElement,
399413
},
400414
uiContext,
401415
hitBy,

packages/core/src/agent/utils.ts

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import type {
66
ExecutionDump,
77
ExecutionTask,
88
ExecutorContext,
9+
LocateResultElement,
910
PlanningLocateParam,
1011
TMultimodalPrompt,
1112
TUserPrompt,
@@ -32,7 +33,7 @@ const debugProfile = getDebug('web:tool:profile');
3233

3334
export async function commonContextParser(
3435
interfaceInstance: AbstractInterface,
35-
_opt: { uploadServerUrl?: string },
36+
_opt: { uploadServerUrl?: string; customScale?: number },
3637
): Promise<UIContext> {
3738
assert(interfaceInstance, 'interfaceInstance is required');
3839

@@ -53,13 +54,27 @@ export async function commonContextParser(
5354
const size = await interfaceInstance.size();
5455
debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);
5556

56-
if (size.dpr && size.dpr !== 1) {
57+
// A custom scale (set and not 1) takes precedence over DPR resizing; otherwise fall back to the original DPR logic
58+
const customScale = _opt.customScale;
59+
60+
if (customScale && customScale !== 1) {
61+
// Custom scale: scale the logical size
62+
debugProfile(`Resizing screenshot with custom scale: ${customScale}`);
63+
const targetWidth = Math.round(size.width * customScale);
64+
const targetHeight = Math.round(size.height * customScale);
65+
screenshotBase64 = await resizeImgBase64(screenshotBase64, {
66+
width: targetWidth,
67+
height: targetHeight,
68+
});
69+
debugProfile('Custom scale ResizeImgBase64 end');
70+
} else if (size.dpr && size.dpr !== 1) {
71+
// DPR scaling: resize physical screenshot to logical size
5772
debugProfile('Resizing screenshot for non-1 dpr');
5873
screenshotBase64 = await resizeImgBase64(screenshotBase64, {
5974
width: size.width,
6075
height: size.height,
6176
});
62-
debugProfile('ResizeImgBase64 end');
77+
debugProfile('DPR ResizeImgBase64 end');
6378
}
6479

6580
return {
@@ -291,3 +306,39 @@ export const parsePrompt = (
291306
: undefined,
292307
};
293308
};
309+
310+
/**
 * Map element coordinates from custom-scaled screenshot space back to device
 * logical space by multiplying every coordinate by `1 / scale`.
 *
 * This is only needed when a custom scale was explicitly provided. When `scale`
 * is unset, `0`, `1`, non-finite or negative, the element is returned as-is
 * (same object reference), since no meaningful inverse mapping exists.
 *
 * The input element is never mutated; a scaled copy is returned instead. Any
 * additional properties on the element or on its `rect` are carried over
 * unchanged.
 *
 * @param element - Located element whose `rect` / `center` are expressed in
 *   scaled screenshot coordinates.
 * @param scale - Custom screenshot scale factor that was applied, if any.
 * @param _deviceDpr - Currently unused; kept so existing callers keep working.
 * @returns The element with `rect` and `center` rounded to integer logical
 *   coordinates, or the original element when no scaling applies.
 */
export function scaleElementCoordinates(
  element: LocateResultElement,
  scale: number | undefined,
  _deviceDpr: number | undefined,
): LocateResultElement {
  // Only apply coordinate scaling when a usable custom scale was provided.
  // Infinity would collapse every coordinate to 0 and a negative scale would
  // flip them, so both are treated as "no custom scale".
  if (!scale || scale === 1 || !Number.isFinite(scale) || scale < 0) {
    return element;
  }

  // Inverse factor converting screenshot-space values back to logical space.
  const coordinateScale = 1 / scale;
  const toLogical = (value: number): number =>
    Math.round(value * coordinateScale);

  const { rect, center } = element;

  return {
    ...element,
    rect: rect
      ? {
          // Spread first so extra rect properties are preserved rather than dropped.
          ...rect,
          left: toLogical(rect.left),
          top: toLogical(rect.top),
          width: toLogical(rect.width),
          height: toLogical(rect.height),
        }
      : rect,
    center: center
      ? ([toLogical(center[0]), toLogical(center[1])] as [number, number])
      : center,
  };
}

packages/core/src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,4 +594,6 @@ export interface AgentOpt {
594594
modelConfig?: TModelConfigFn;
595595
useCache?: boolean;
596596
replanningCycleLimit?: number;
597+
/** Scale factor applied to the screenshot when building the UI context; located coordinates are mapped back by 1/scale. Unset or 1 disables custom scaling. */
598+
scale?: number;
597599
}

0 commit comments

Comments
 (0)