Skip to content

Commit e50422e

Browse files
authored
refactor(core): move xpath cache logic into web-integration (#1263)
* refactor(core): move xpath cache logic into web
* fix(core): ci
* chore(core): fix lint
* fix(chore): fix cache key in cache file
* chore(core): fix lint
1 parent 6741adc commit e50422e

File tree

11 files changed

+290
-148
lines changed

11 files changed

+290
-148
lines changed

packages/core/src/agent/task-cache.ts

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@ import assert from 'node:assert';
22
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
33
import { dirname, join } from 'node:path';
44
import { isDeepStrictEqual } from 'node:util';
5-
import type { TUserPrompt } from '@/index';
5+
import type { TUserPrompt } from '@/ai-model';
6+
import type { ElementCacheFeature } from '@/types';
67
import { getMidsceneRunSubDir } from '@midscene/shared/common';
78
import {
89
MIDSCENE_CACHE_MAX_FILENAME_LENGTH,
@@ -29,7 +30,9 @@ export interface PlanningCache {
2930
export interface LocateCache {
3031
type: 'locate';
3132
prompt: TUserPrompt;
32-
xpaths: string[];
33+
cache?: ElementCacheFeature;
34+
/** @deprecated kept for backward compatibility */
35+
xpaths?: string[];
3336
}
3437

3538
export interface MatchCacheResult<T extends PlanningCache | LocateCache> {
@@ -117,6 +120,15 @@ export class TaskCache {
117120
isDeepStrictEqual(item.prompt, prompt) &&
118121
!this.matchedCacheIndices.has(key)
119122
) {
123+
if (item.type === 'locate') {
124+
const locateItem = item as LocateCache;
125+
if (!locateItem.cache && Array.isArray(locateItem.xpaths)) {
126+
locateItem.cache = { xpaths: locateItem.xpaths };
127+
}
128+
if ('xpaths' in locateItem) {
129+
locateItem.xpaths = undefined;
130+
}
131+
}
120132
this.matchedCacheIndices.add(key);
121133
debug(
122134
'cache found and marked as used, type: %s, prompt: %s, index: %d',
@@ -294,7 +306,11 @@ export class TaskCache {
294306
});
295307
} else {
296308
cachedRecord.updateFn((cache) => {
297-
(cache as LocateCache).xpaths = newRecord.xpaths;
309+
const locateCache = cache as LocateCache;
310+
locateCache.cache = newRecord.cache;
311+
if ('xpaths' in locateCache) {
312+
locateCache.xpaths = undefined;
313+
}
298314
});
299315
}
300316
} else {

packages/core/src/agent/tasks.ts

Lines changed: 39 additions & 93 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import {
22
ConversationHistory,
3-
elementByPositionWithElementInfo,
43
findAllMidsceneLocatorField,
54
uiTarsPlanning,
65
} from '@/ai-model';
@@ -10,6 +9,7 @@ import {
109
type BaseElement,
1110
type DetailedLocateParam,
1211
type DumpSubscriber,
12+
type ElementCacheFeature,
1313
type ExecutionRecorderItem,
1414
type ExecutionTaskActionApply,
1515
type ExecutionTaskApply,
@@ -40,7 +40,6 @@ import {
4040
plan,
4141
} from '@/index';
4242
import { sleep } from '@/utils';
43-
import { NodeType } from '@midscene/shared/constants';
4443
import {
4544
type IModelConfig,
4645
MIDSCENE_REPLANNING_CYCLE_LIMIT,
@@ -123,68 +122,6 @@ export class TaskExecutor {
123122
return item;
124123
}
125124

126-
private async getElementXpath(
127-
uiContext: UIContext<BaseElement>,
128-
element: LocateResultElement,
129-
): Promise<string[] | undefined> {
130-
if (!(this.interface as any).getXpathsByPoint) {
131-
debug('getXpathsByPoint is not supported for this interface');
132-
return undefined;
133-
}
134-
135-
let elementId = element?.id;
136-
if (element?.isOrderSensitive !== undefined) {
137-
try {
138-
const xpaths = await (this.interface as any).getXpathsByPoint(
139-
{
140-
left: element.center[0],
141-
top: element.center[1],
142-
},
143-
element?.isOrderSensitive,
144-
);
145-
146-
return xpaths;
147-
} catch (error) {
148-
debug('getXpathsByPoint failed: %s', error);
149-
return undefined;
150-
}
151-
}
152-
153-
// find the nearest xpath for the element
154-
if (element?.attributes?.nodeType === NodeType.POSITION) {
155-
await this.insight.contextRetrieverFn('locate');
156-
const info = elementByPositionWithElementInfo(
157-
uiContext.tree,
158-
{
159-
x: element.center[0],
160-
y: element.center[1],
161-
},
162-
{
163-
requireStrictDistance: false,
164-
filterPositionElements: true,
165-
},
166-
);
167-
if (info?.id) {
168-
elementId = info.id;
169-
} else {
170-
debug(
171-
'no element id found for position node, will not update cache',
172-
element,
173-
);
174-
}
175-
}
176-
177-
if (!elementId) {
178-
return undefined;
179-
}
180-
try {
181-
const result = await (this.interface as any).getXpathsById(elementId);
182-
return result;
183-
} catch (error) {
184-
debug('getXpathsById error: ', error);
185-
}
186-
}
187-
188125
private prependExecutorWithScreenshot(
189126
taskApply: ExecutionTaskApply,
190127
appendAfterExecution = false,
@@ -283,12 +220,12 @@ export class TaskExecutor {
283220
const cachePrompt = param.prompt;
284221
const locateCacheRecord =
285222
this.taskCache?.matchLocateCache(cachePrompt);
286-
const xpaths = locateCacheRecord?.cacheContent?.xpaths;
223+
const cacheEntry = locateCacheRecord?.cacheContent?.cache;
287224
const elementFromCache = userExpectedPathHitFlag
288225
? null
289226
: await matchElementFromCache(
290227
this,
291-
xpaths,
228+
cacheEntry,
292229
cachePrompt,
293230
param.cacheable,
294231
);
@@ -324,38 +261,47 @@ export class TaskExecutor {
324261
elementFromAiLocate;
325262

326263
// update cache
327-
let currentXpaths: string[] | undefined;
264+
let currentCacheEntry: ElementCacheFeature | undefined;
328265
if (
329266
element &&
330267
this.taskCache &&
331268
!cacheHitFlag &&
332269
param?.cacheable !== false
333270
) {
334-
const elementXpaths = await this.getElementXpath(
335-
uiContext,
336-
element,
337-
);
338-
if (elementXpaths?.length) {
339-
debug(
340-
'update cache, prompt: %s, xpaths: %s',
341-
cachePrompt,
342-
elementXpaths,
343-
);
344-
currentXpaths = elementXpaths;
345-
this.taskCache.updateOrAppendCacheRecord(
346-
{
347-
type: 'locate',
348-
prompt: cachePrompt,
349-
xpaths: elementXpaths,
350-
},
351-
locateCacheRecord,
352-
);
271+
if (this.interface.cacheFeatureForRect) {
272+
try {
273+
const feature = await this.interface.cacheFeatureForRect(
274+
element.rect,
275+
element.isOrderSensitive !== undefined
276+
? { _orderSensitive: element.isOrderSensitive }
277+
: undefined,
278+
);
279+
if (feature && Object.keys(feature).length > 0) {
280+
debug(
281+
'update cache, prompt: %s, cache: %o',
282+
cachePrompt,
283+
feature,
284+
);
285+
currentCacheEntry = feature;
286+
this.taskCache.updateOrAppendCacheRecord(
287+
{
288+
type: 'locate',
289+
prompt: cachePrompt,
290+
cache: feature,
291+
},
292+
locateCacheRecord,
293+
);
294+
} else {
295+
debug(
296+
'no cache data returned, skip cache update, prompt: %s',
297+
cachePrompt,
298+
);
299+
}
300+
} catch (error) {
301+
debug('cacheFeatureForRect failed: %s', error);
302+
}
353303
} else {
354-
debug(
355-
'no xpaths found, will not update cache',
356-
cachePrompt,
357-
elementXpaths,
358-
);
304+
debug('cacheFeatureForRect is not supported, skip cache update');
359305
}
360306
}
361307
if (!element) {
@@ -375,8 +321,8 @@ export class TaskExecutor {
375321
hitBy = {
376322
from: 'Cache',
377323
context: {
378-
xpathsFromCache: xpaths,
379-
xpathsToSave: currentXpaths,
324+
cacheEntry,
325+
cacheToSave: currentCacheEntry,
380326
},
381327
};
382328
} else if (planHitFlag) {

packages/core/src/agent/utils.ts

Lines changed: 44 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,19 @@ import { elementByPositionWithElementInfo } from '@/ai-model';
22
import type { AbstractInterface } from '@/device';
33
import type {
44
BaseElement,
5+
ElementCacheFeature,
56
ElementTreeNode,
67
ExecutionDump,
78
ExecutionTask,
89
ExecutorContext,
10+
LocateResultElement,
911
PlanningLocateParam,
1012
TMultimodalPrompt,
1113
TUserPrompt,
1214
UIContext,
1315
} from '@/index';
1416
import { uploadTestInfoToServer } from '@/utils';
17+
import { NodeType } from '@midscene/shared/constants';
1518
import {
1619
MIDSCENE_REPORT_TAG_NAME,
1720
globalConfigManager,
@@ -169,36 +172,51 @@ export function matchElementFromPlan(
169172

170173
export async function matchElementFromCache(
171174
taskExecutor: TaskExecutor,
172-
xpaths: string[] | undefined,
175+
cacheEntry: ElementCacheFeature | undefined,
173176
cachePrompt: TUserPrompt,
174177
cacheable: boolean | undefined,
175-
) {
178+
): Promise<LocateResultElement | undefined> {
179+
if (!cacheEntry) {
180+
return undefined;
181+
}
182+
183+
if (cacheable === false) {
184+
cacheDebug('cache disabled for prompt: %s', cachePrompt);
185+
return undefined;
186+
}
187+
188+
if (!taskExecutor.taskCache?.isCacheResultUsed) {
189+
return undefined;
190+
}
191+
192+
if (!taskExecutor.interface.rectMatchesCacheFeature) {
193+
cacheDebug(
194+
'interface does not implement rectMatchesCacheFeature, skip cache',
195+
);
196+
return undefined;
197+
}
198+
176199
try {
177-
if (
178-
xpaths?.length &&
179-
taskExecutor.taskCache?.isCacheResultUsed &&
180-
cacheable !== false &&
181-
(taskExecutor.interface as any).getElementInfoByXpath
182-
) {
183-
// hit cache, use new id
184-
for (let i = 0; i < xpaths.length; i++) {
185-
const element = await (
186-
taskExecutor.interface as any
187-
).getElementInfoByXpath(xpaths[i]);
188-
189-
if (element?.id) {
190-
cacheDebug('cache hit, prompt: %s', cachePrompt);
191-
cacheDebug(
192-
'found a new element with same xpath, xpath: %s, id: %s',
193-
xpaths[i],
194-
element?.id,
195-
);
196-
return element;
197-
}
198-
}
199-
}
200+
const rect =
201+
await taskExecutor.interface.rectMatchesCacheFeature(cacheEntry);
202+
const element: LocateResultElement = {
203+
id: uuid(),
204+
center: [
205+
Math.round(rect.left + rect.width / 2),
206+
Math.round(rect.top + rect.height / 2),
207+
],
208+
rect,
209+
xpaths: [],
210+
attributes: {
211+
nodeType: NodeType.POSITION,
212+
},
213+
};
214+
215+
cacheDebug('cache hit, prompt: %s', cachePrompt);
216+
return element;
200217
} catch (error) {
201-
cacheDebug('get element info by xpath error: ', error);
218+
cacheDebug('rectMatchesCacheFeature error: %s', error);
219+
return undefined;
202220
}
203221
}
204222

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import type { TVlModeTypes } from '@midscene/shared/env';
22
export function bboxDescription(vlMode: TVlModeTypes | undefined) {
33
if (vlMode === 'gemini') {
4-
return '2d bounding box as [ymin, xmin, ymax, xmax]';
4+
return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';
55
}
66
return '2d bounding box as [xmin, ymin, xmax, ymax]';
77
}

packages/core/src/device/index.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import type { DeviceAction } from '@/index';
22
import { getMidsceneLocationSchema, z } from '@/index';
33
import type { ElementNode } from '@midscene/shared/extractor';
44
import { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';
5-
import type { Size, UIContext } from '../types';
5+
import type { ElementCacheFeature, Rect, Size, UIContext } from '../types';
66

77
export abstract class AbstractInterface {
88
abstract interfaceType: string;
@@ -11,6 +11,14 @@ export abstract class AbstractInterface {
1111
abstract size(): Promise<Size>;
1212
abstract actionSpace(): DeviceAction[] | Promise<DeviceAction[]>;
1313

14+
abstract cacheFeatureForRect?(
15+
rect: Rect,
16+
opt?: { _orderSensitive: boolean },
17+
): Promise<ElementCacheFeature>;
18+
abstract rectMatchesCacheFeature?(
19+
feature: ElementCacheFeature,
20+
): Promise<Rect>;
21+
1422
abstract destroy?(): Promise<void>;
1523

1624
abstract describe?(): string;

packages/core/src/types.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -139,6 +139,8 @@ export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
139139

140140
export type InsightExtractParam = string | Record<string, string>;
141141

142+
export type ElementCacheFeature = Record<string, unknown>;
143+
142144
export type LocateResultElement = {
143145
center: [number, number];
144146
rect: Rect;

0 commit comments

Comments
 (0)