Skip to content

Commit 0359d23

Browse files
authored
get xpath by coordinate (#952)
* feat(web-integration): get xpath by coordinate like (x,y)
* test(shared): add tests
* feat(shared): add support for SVG elements and improve XPath generation for text nodes
1 parent 503bdc5 commit 0359d23

File tree

22 files changed

+935
-81
lines changed

22 files changed

+935
-81
lines changed

packages/android/src/page/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,13 @@ ${Object.keys(size)
783783
throw new Error('Not implemented');
784784
}
785785

786+
async getXpathsByPoint(
787+
point: Point,
788+
isOrderSensitive: boolean,
789+
): Promise<string[]> {
790+
throw new Error('Not implemented');
791+
}
792+
786793
async getElementInfoByXpath(xpath: string): Promise<ElementInfo> {
787794
throw new Error('Not implemented');
788795
}

packages/core/src/ai-model/inspect.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ export async function AiLocateElement<
7676
rawResponse: string;
7777
elementById: ElementById;
7878
usage?: AIUsageInfo;
79+
isOrderSensitive?: boolean;
7980
}> {
8081
const { context, targetElementDescription, callAI } = options;
8182
const { screenshotBase64 } = context;
@@ -208,6 +209,12 @@ export async function AiLocateElement<
208209
rawResponse,
209210
elementById,
210211
usage: res.usage,
212+
isOrderSensitive:
213+
typeof res.content === 'object' &&
214+
res.content !== null &&
215+
'isOrderSensitive' in res.content
216+
? (res.content as any).isOrderSensitive
217+
: undefined,
211218
};
212219
}
213220

packages/core/src/ai-model/prompt/llm-locator.ts

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,47 @@ You are an expert in software testing.
1414
## Objective:
1515
- Identify elements in screenshots and text that match the user's description.
1616
- Give the coordinates of the element that matches the user's description best in the screenshot.
17+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
1718
1819
## Output Format:
1920
\`\`\`json
2021
{
2122
"bbox": [number, number, number, number], // ${bboxComment}
22-
"errors"?: string[]
23+
"errors"?: string[],
24+
"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
2325
}
2426
\`\`\`
2527
2628
Fields:
2729
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
30+
* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
2831
* \`errors\` is an optional array of error messages (if any)
2932
30-
For example, when an element is found:
33+
Order-sensitive means the description contains phrases like:
34+
- "the third item in the list"
35+
- "the last button"
36+
- "the first input box"
37+
- "the second row"
38+
39+
Not order-sensitive means the description is like:
40+
- "confirm button"
41+
- "search box"
42+
- "password input"
43+
44+
For example, when an element is found and the description is order-sensitive:
3145
\`\`\`json
3246
{
3347
"bbox": [100, 100, 200, 200],
48+
"isOrderSensitive": true,
3449
"errors": []
3550
}
3651
\`\`\`
3752
38-
When no element is found:
53+
When no element is found and the description is not order-sensitive:
3954
\`\`\`json
4055
{
4156
"bbox": [],
57+
"isOrderSensitive": false,
4258
"errors": ["I can see ..., but {some element} is not found"]
4359
}
4460
\`\`\`
@@ -52,6 +68,7 @@ You are an expert in software page image (2D) and page element text analysis.
5268
## Objective:
5369
- Identify elements in screenshots and text that match the user's description.
5470
- Return JSON data containing the selection reason and element ID.
71+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
5572
5673
## Skills:
5774
- Image analysis and recognition
@@ -63,6 +80,7 @@ You are an expert in software page image (2D) and page element text analysis.
6380
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
6481
3. Found the required number of elements
6582
4. Return JSON data containing the selection reason and element ID.
83+
5. Judge whether the user's description is order-sensitive (see below for definition and examples).
6684
6785
## Constraints:
6886
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
@@ -72,6 +90,10 @@ You are an expert in software page image (2D) and page element text analysis.
7290
- The returned data must conform to the specified JSON format.
7391
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
7492
93+
## Order-Sensitive Definition:
94+
- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
95+
- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
96+
7597
## Output Format:
7698
7799
Please return the result in JSON format as follows:
@@ -87,6 +109,7 @@ Please return the result in JSON format as follows:
87109
}
88110
// More elements...
89111
],
112+
"isOrderSensitive": true, // or false, depending on the user's description
90113
"errors": [] // Array of strings containing any error messages
91114
}
92115
\`\`\`
@@ -175,6 +198,7 @@ Output Example:
175198
"id": "1231"
176199
}
177200
],
201+
"isOrderSensitive": true,
178202
"errors": []
179203
}
180204
\`\`\`
@@ -213,6 +237,11 @@ export const locatorSchema: ResponseFormatJSONSchema = {
213237
},
214238
description: 'List of found elements',
215239
},
240+
isOrderSensitive: {
241+
type: 'boolean',
242+
description:
243+
'Whether the targetElementDescription is order-sensitive (true/false)',
244+
},
216245
errors: {
217246
type: 'array',
218247
items: {
@@ -221,7 +250,7 @@ export const locatorSchema: ResponseFormatJSONSchema = {
221250
description: 'List of error messages, if any',
222251
},
223252
},
224-
required: ['elements', 'errors'],
253+
required: ['elements', 'isOrderSensitive', 'errors'],
225254
additionalProperties: false,
226255
},
227256
},

packages/core/src/insight/index.ts

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,19 @@ export default class Insight<
140140
}
141141

142142
const startTime = Date.now();
143-
const { parseResult, rect, elementById, rawResponse, usage } =
144-
await AiLocateElement({
145-
callAI: callAI || this.aiVendorFn,
146-
context,
147-
targetElementDescription: queryPrompt,
148-
searchConfig: searchAreaResponse,
149-
});
143+
const {
144+
parseResult,
145+
rect,
146+
elementById,
147+
rawResponse,
148+
usage,
149+
isOrderSensitive,
150+
} = await AiLocateElement({
151+
callAI: callAI || this.aiVendorFn,
152+
context,
153+
targetElementDescription: queryPrompt,
154+
searchConfig: searchAreaResponse,
155+
});
150156

151157
const timeCost = Date.now() - startTime;
152158
const taskInfo: InsightTaskInfo = {
@@ -219,6 +225,7 @@ export default class Insight<
219225
rect: elements[0]!.rect,
220226
xpaths: elements[0]!.xpaths || [],
221227
attributes: elements[0]!.attributes,
228+
isOrderSensitive,
222229
},
223230
rect,
224231
};

packages/core/src/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,13 @@ export interface AIElementLocatorResponse {
6565
xpaths?: string[];
6666
}[];
6767
bbox?: [number, number, number, number];
68+
isOrderSensitive?: boolean;
6869
errors?: string[];
6970
}
7071

7172
export interface AIElementCoordinatesResponse {
7273
bbox: [number, number, number, number];
74+
isOrderSensitive?: boolean;
7375
errors?: string[];
7476
}
7577

@@ -156,6 +158,7 @@ export type LocateResultElement = {
156158
nodeType: NodeType;
157159
[key: string]: string;
158160
};
161+
isOrderSensitive?: boolean;
159162
};
160163

161164
export interface LocateResult {

packages/core/tests/unit-test/executor/__snapshots__/index.test.ts.snap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ exports[`executor > insight - basic run 1`] = `
1212
],
1313
"id": "0",
1414
"indexId": undefined,
15+
"isOrderSensitive": undefined,
1516
"rect": {
1617
"height": 100,
1718
"left": 200,

packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ You are an expert in software page image (2D) and page element text analysis.
112112
## Objective:
113113
- Identify elements in screenshots and text that match the user's description.
114114
- Return JSON data containing the selection reason and element ID.
115+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
115116
116117
## Skills:
117118
- Image analysis and recognition
@@ -123,6 +124,7 @@ You are an expert in software page image (2D) and page element text analysis.
123124
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
124125
3. Found the required number of elements
125126
4. Return JSON data containing the selection reason and element ID.
127+
5. Judge whether the user's description is order-sensitive (see below for definition and examples).
126128
127129
## Constraints:
128130
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
@@ -132,6 +134,10 @@ You are an expert in software page image (2D) and page element text analysis.
132134
- The returned data must conform to the specified JSON format.
133135
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
134136
137+
## Order-Sensitive Definition:
138+
- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
139+
- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
140+
135141
## Output Format:
136142
137143
Please return the result in JSON format as follows:
@@ -147,6 +153,7 @@ Please return the result in JSON format as follows:
147153
}
148154
// More elements...
149155
],
156+
"isOrderSensitive": true, // or false, depending on the user's description
150157
"errors": [] // Array of strings containing any error messages
151158
}
152159
\`\`\`
@@ -235,6 +242,7 @@ Output Example:
235242
"id": "1231"
236243
}
237244
],
245+
"isOrderSensitive": true,
238246
"errors": []
239247
}
240248
\`\`\`
@@ -250,31 +258,47 @@ You are an expert in software testing.
250258
## Objective:
251259
- Identify elements in screenshots and text that match the user's description.
252260
- Give the coordinates of the element that matches the user's description best in the screenshot.
261+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
253262
254263
## Output Format:
255264
\`\`\`json
256265
{
257266
"bbox": [number, number, number, number], // 2d bounding box as [ymin, xmin, ymax, xmax]
258-
"errors"?: string[]
267+
"errors"?: string[],
268+
"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
259269
}
260270
\`\`\`
261271
262272
Fields:
263273
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
274+
* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
264275
* \`errors\` is an optional array of error messages (if any)
265276
266-
For example, when an element is found:
277+
Order-sensitive means the description contains phrases like:
278+
- "the third item in the list"
279+
- "the last button"
280+
- "the first input box"
281+
- "the second row"
282+
283+
Not order-sensitive means the description is like:
284+
- "confirm button"
285+
- "search box"
286+
- "password input"
287+
288+
For example, when an element is found and the description is order-sensitive:
267289
\`\`\`json
268290
{
269291
"bbox": [100, 100, 200, 200],
292+
"isOrderSensitive": true,
270293
"errors": []
271294
}
272295
\`\`\`
273296
274-
When no element is found:
297+
When no element is found and the description is not order-sensitive:
275298
\`\`\`json
276299
{
277300
"bbox": [],
301+
"isOrderSensitive": false,
278302
"errors": ["I can see ..., but {some element} is not found"]
279303
}
280304
\`\`\`
@@ -289,31 +313,47 @@ You are an expert in software testing.
289313
## Objective:
290314
- Identify elements in screenshots and text that match the user's description.
291315
- Give the coordinates of the element that matches the user's description best in the screenshot.
316+
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
292317
293318
## Output Format:
294319
\`\`\`json
295320
{
296321
"bbox": [number, number, number, number], // 2d bounding box as [xmin, ymin, xmax, ymax]
297-
"errors"?: string[]
322+
"errors"?: string[],
323+
"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
298324
}
299325
\`\`\`
300326
301327
Fields:
302328
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
329+
* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
303330
* \`errors\` is an optional array of error messages (if any)
304331
305-
For example, when an element is found:
332+
Order-sensitive means the description contains phrases like:
333+
- "the third item in the list"
334+
- "the last button"
335+
- "the first input box"
336+
- "the second row"
337+
338+
Not order-sensitive means the description is like:
339+
- "confirm button"
340+
- "search box"
341+
- "password input"
342+
343+
For example, when an element is found and the description is order-sensitive:
306344
\`\`\`json
307345
{
308346
"bbox": [100, 100, 200, 200],
347+
"isOrderSensitive": true,
309348
"errors": []
310349
}
311350
\`\`\`
312351
313-
When no element is found:
352+
When no element is found and the description is not order-sensitive:
314353
\`\`\`json
315354
{
316355
"bbox": [],
356+
"isOrderSensitive": false,
317357
"errors": ["I can see ..., but {some element} is not found"]
318358
}
319359
\`\`\`

packages/shared/src/extractor/dom-util.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ export function isAElement(
2323
return node instanceof HTMLElement && node.tagName.toLowerCase() === 'a';
2424
}
2525

26+
export function isSvgElement(
27+
node: globalThis.Node,
28+
): node is globalThis.SVGSVGElement {
29+
return node instanceof SVGElement;
30+
}
31+
2632
export function isImgElement(
2733
node: globalThis.Node,
2834
): node is globalThis.HTMLImageElement {

packages/shared/src/extractor/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ export { setNodeHashCacheListOnWindow, getNodeFromCacheList } from './util';
3939

4040
export {
4141
getXpathsById,
42+
getXpathsByPoint,
4243
getNodeInfoByXpath,
4344
getElementInfoByXpath,
4445
} from './locator';

0 commit comments

Comments
 (0)