Skip to content

Commit b6694fa

Browse files
authored
feat(core): use action space to expose actions (#1031)
* chore(core): update prompt
* feat(core): define action space for llm planning
* chore(core): use action space to declare actions for agent
* fix(core): prompt for planning
* fix(core): lint
* feat(core): update prompt for llm planning
* fix(core): prompt for planning
1 parent 9b2715f commit b6694fa

File tree

16 files changed

+594
-374
lines changed

16 files changed

+594
-374
lines changed

packages/android/src/page/index.ts

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { randomUUID } from 'node:crypto';
33
import fs from 'node:fs';
44
import path from 'node:path';
55
import { type Point, type Size, getAIConfig } from '@midscene/core';
6-
import type { PageType } from '@midscene/core';
6+
import type { DeviceAction, PageType } from '@midscene/core';
77
import { getTmpFile, sleep } from '@midscene/core/utils';
88
import {
99
MIDSCENE_ADB_PATH,
@@ -15,7 +15,11 @@ import type { ElementInfo } from '@midscene/shared/extractor';
1515
import { isValidPNGImageBuffer, resizeImg } from '@midscene/shared/img';
1616
import { getDebug } from '@midscene/shared/logger';
1717
import { repeat } from '@midscene/shared/utils';
18-
import type { AndroidDeviceInputOpt, AndroidDevicePage } from '@midscene/web';
18+
import {
19+
type AndroidDeviceInputOpt,
20+
type AndroidDevicePage,
21+
commonWebActions,
22+
} from '@midscene/web';
1923
import { ADB } from 'appium-adb';
2024

2125
// Android only: it is impossible to scroll directly to the bottom, so we need to set a default number of scroll times
@@ -31,6 +35,51 @@ export type AndroidDeviceOpt = {
3135
imeStrategy?: 'always-yadb' | 'yadb-for-non-ascii';
3236
} & AndroidDeviceInputOpt;
3337

38+
// Async function with an empty body; used as the `call` handler of every entry in androidActions.
const asyncNoop = async () => {};
39+
// Android-specific action declarations. AndroidDevice.actionSpace() returns
// these appended to commonWebActions.
// NOTE(review): every `call` here is a no-op — presumably the actual handling
// of these actions lives elsewhere; confirm before relying on `call`.
const androidActions: DeviceAction[] = [
40+
// System "back": declares no params and no element location.
{
41+
name: 'AndroidBackButton',
42+
description: 'Trigger the system "back" operation on Android devices',
43+
location: false,
44+
call: asyncNoop,
45+
},
46+
// System "home": declares no params and no element location.
{
47+
name: 'AndroidHomeButton',
48+
description: 'Trigger the system "home" operation on Android devices',
49+
location: false,
50+
call: asyncNoop,
51+
},
52+
// System "recent apps": declares no params and no element location.
{
53+
name: 'AndroidRecentAppsButton',
54+
description:
55+
'Trigger the system "recent apps" operation on Android devices',
56+
location: false,
57+
call: asyncNoop,
58+
},
59+
// Long press: optional `duration` param; locating a target element is optional.
{
60+
name: 'AndroidLongPress',
61+
description:
62+
'Trigger a long press on the screen at specified coordinates on Android devices',
63+
paramSchema: '{ duration?: number }',
64+
paramDescription: 'The duration of the long press',
65+
location: 'optional',
66+
whatToLocate: 'The element to be long pressed',
67+
call: asyncNoop,
68+
},
69+
// Pull: required `direction`, optional `distance` and `duration`; locating a
// target element is optional.
{
70+
name: 'AndroidPull',
71+
description:
72+
'Trigger pull down to refresh or pull up actions on Android devices',
73+
paramSchema:
74+
'{ direction: "up" | "down", distance?: number, duration?: number }',
75+
paramDescription:
76+
'The direction to pull, the distance to pull, and the duration of the pull.',
77+
location: 'optional',
78+
whatToLocate: 'The element to be pulled',
79+
call: asyncNoop,
80+
},
81+
];
82+
3483
export class AndroidDevice implements AndroidDevicePage {
3584
private deviceId: string;
3685
private yadbPushed = false;
@@ -42,6 +91,10 @@ export class AndroidDevice implements AndroidDevicePage {
4291
uri: string | undefined;
4392
options?: AndroidDeviceOpt;
4493

94+
/**
 * Returns the actions this device exposes: the entries of commonWebActions
 * followed by the Android-specific androidActions.
 * Assumes commonWebActions is a plain array, in which case concat() builds a
 * new array and leaves both inputs unmodified — TODO confirm its declaration
 * in @midscene/web.
 */
actionSpace(): DeviceAction[] {
95+
return commonWebActions.concat(androidActions);
96+
}
97+
4598
constructor(deviceId: string, options?: AndroidDeviceOpt) {
4699
assert(deviceId, 'deviceId is required for AndroidDevice');
47100

packages/core/src/ai-model/common.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,6 @@ export function buildYamlFlowFromPlans(
378378
// not implemented in yaml yet
379379
} else if (
380380
type === 'Error' ||
381-
type === 'ExpectedFalsyCondition' ||
382381
type === 'Assert' ||
383382
type === 'AssertWithoutThrow' ||
384383
type === 'Finished'

packages/core/src/ai-model/llm-planning.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
import type { PageType, PlanningAIResponse, UIContext } from '@/types';
1+
import type {
2+
DeviceAction,
3+
PageType,
4+
PlanningAIResponse,
5+
UIContext,
6+
} from '@/types';
27
import { vlLocateMode } from '@midscene/shared/env';
38
import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
49
import { assert } from '@midscene/shared/utils';
@@ -23,6 +28,7 @@ export async function plan(
2328
opts: {
2429
context: UIContext;
2530
pageType: PageType;
31+
actionSpace: DeviceAction[];
2632
callAI?: typeof callAiFn<PlanningAIResponse>;
2733
log?: string;
2834
actionContext?: string;
@@ -34,7 +40,7 @@ export async function plan(
3440
await describeUserPage(context);
3541

3642
const systemPrompt = await systemPromptToTaskPlanning({
37-
pageType: opts.pageType,
43+
actionSpace: opts.actionSpace,
3844
vlMode: vlLocateMode(),
3945
});
4046
const taskBackgroundContextText = generateTaskBackgroundContext(

packages/core/src/ai-model/prompt/llm-planning.ts

Lines changed: 135 additions & 161 deletions
Large diffs are not rendered by default.

packages/core/src/types.ts

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,6 @@ export interface PlanningAction<ParamType = any> {
266266
| 'KeyboardPress'
267267
| 'Scroll'
268268
| 'Error'
269-
| 'ExpectedFalsyCondition'
270269
| 'Assert'
271270
| 'AssertWithoutThrow'
272271
| 'Sleep'
@@ -293,12 +292,6 @@ export interface PlanningAIResponse {
293292
yamlString?: string;
294293
}
295294

296-
// export interface PlanningFurtherPlan {
297-
// whatToDoNext: string;
298-
// log: string;
299-
// }
300-
// export type PlanningActionParamPlan = PlanningFurtherPlan;
301-
302295
export type PlanningActionParamTap = null;
303296
export type PlanningActionParamHover = null;
304297
export type PlanningActionParamRightClick = null;
@@ -622,3 +615,13 @@ export type TUserPrompt =
622615
| ({
623616
prompt: string;
624617
} & Partial<TMultimodalPrompt>);
618+
619+
/**
 * Declaration of a single action that a device exposes through its action space.
 * `ParamType` is the type of the argument passed to `call` (defaults to `any`).
 */
export interface DeviceAction<ParamType = any> {
620+
// Action name, e.g. 'AndroidBackButton'.
name: string;
621+
// Optional human-readable description of what the action does.
description?: string;
622+
// Optional parameter shape written as a string, e.g. '{ duration?: number }'.
paramSchema?: string;
623+
// Optional human-readable description of the parameters.
paramDescription?: string;
624+
// Whether the action needs an element to be located: 'required', 'optional', or false.
// NOTE(review): the meaning of an omitted `location` is not visible here — confirm with the consumer.
location?: 'required' | 'optional' | false;
625+
whatToLocate?: string; // what to locate if location is required or optional
626+
// Handler invoked with the action's param; may be synchronous or return a Promise.
call: (param: ParamType) => Promise<void> | void;
627+
}

packages/core/tests/ai/llm-planning/basic.test.ts

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { plan } from '@/ai-model';
22
import { vlLocateMode } from '@midscene/shared/env';
3+
import { mockActionSpace } from 'tests/common';
34
import { getContextFromFixture } from 'tests/evaluation';
45
/* eslint-disable max-lines-per-function */
56
import { describe, expect, it, vi } from 'vitest';
@@ -19,9 +20,12 @@ describe.skipIf(vlMode)('automation - llm planning', () => {
1920
'type "Why is the earth a sphere?", wait 3.5s, hit Enter',
2021
{
2122
context,
23+
actionSpace: mockActionSpace,
24+
pageType: 'puppeteer',
2225
},
2326
);
2427
expect(actions).toBeTruthy();
28+
2529
expect(actions!.length).toBe(3);
2630
expect(actions![0].type).toBe('Input');
2731
expect(actions![1].type).toBe('Sleep');
@@ -34,7 +38,7 @@ describe.skipIf(vlMode)('automation - llm planning', () => {
3438
const { context } = await getContextFromFixture('todo');
3539
const { actions } = await plan(
3640
'Scroll down the page by 200px, scroll up the page by 100px, scroll right the second item of the task list by 300px',
37-
{ context },
41+
{ context, actionSpace: mockActionSpace, pageType: 'puppeteer' },
3842
);
3943
expect(actions).toBeTruthy();
4044
expect(actions!.length).toBe(3);
@@ -79,7 +83,11 @@ describe('planning', () => {
7983
todoInstructions.forEach(({ name, instruction }) => {
8084
it(`todo mvc - ${name}`, async () => {
8185
const { context } = await getContextFromFixture('todo');
82-
const { actions } = await plan(instruction, { context });
86+
const { actions } = await plan(instruction, {
87+
context,
88+
actionSpace: mockActionSpace,
89+
pageType: 'puppeteer',
90+
});
8391
expect(actions).toBeTruthy();
8492
expect(actions![0].locate).toBeTruthy();
8593
expect(actions![0].locate?.prompt).toBeTruthy();
@@ -93,29 +101,31 @@ describe('planning', () => {
93101
'Scroll left the status filters (with a button named "completed")',
94102
{
95103
context,
104+
actionSpace: mockActionSpace,
105+
pageType: 'puppeteer',
96106
},
97107
);
98108
expect(actions).toBeTruthy();
99109
expect(actions![0].type).toBe('Scroll');
100110
expect(actions![0].locate).toBeTruthy();
101111
});
102112

103-
it.skip('should not throw in an "if" statement', async () => {
113+
it('should not throw in an "if" statement', async () => {
104114
const { context } = await getContextFromFixture('todo');
105115
const { actions, error } = await plan(
106116
'If there is a cookie prompt, close it',
107-
{ context },
117+
{ context, actionSpace: mockActionSpace, pageType: 'puppeteer' },
108118
);
109119

110-
expect(actions?.length === 1).toBeTruthy();
111-
expect(actions?.[0]!.type).toBe('ExpectedFalsyCondition');
120+
expect(error).toBeFalsy();
121+
expect(actions?.length).toBe(0);
112122
});
113123

114124
it('should make mark unfinished when something is not found', async () => {
115125
const { context } = await getContextFromFixture('todo');
116126
const res = await plan(
117-
'click the input box, wait 300ms, click the close button of the cookie prompt',
118-
{ context },
127+
'click the input box, wait 300ms. After that, the page will be redirected to the home page, click the close button of the cookie prompt on the home page',
128+
{ context, actionSpace: mockActionSpace, pageType: 'puppeteer' },
119129
);
120130

121131
expect(res.more_actions_needed_by_instruction).toBeTruthy();

packages/core/tests/common.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import type { DeviceAction } from '@/types';
2+
3+
/**
 * Action space fixture for tests: declares Tap, Sleep, Input, KeyboardPress
 * and Scroll, each with an empty async `call`.
 * The llm-planning basic tests pass it to plan() as `actionSpace`.
 */
export const mockActionSpace: DeviceAction[] = [
4+
// Tap: element location is required.
// NOTE(review): a `{ value: string }` param is declared for Tap — confirm this is intended.
{
5+
name: 'Tap',
6+
description: 'Tap the element',
7+
location: 'required',
8+
whatToLocate: 'The element to be tapped',
9+
paramSchema: '{ value: string }',
10+
paramDescription: 'The value to be tapped',
11+
call: async () => {},
12+
},
13+
// Sleep: takes `timeMs`; no element location.
{
14+
name: 'Sleep',
15+
description: 'Sleep for a period of time',
16+
paramSchema: '{ timeMs: number }',
17+
paramDescription: 'The duration of the sleep in milliseconds',
18+
location: false,
19+
call: async () => {},
20+
},
21+
// Input: takes `value`; element location is optional.
{
22+
name: 'Input',
23+
description: 'Input text into the input field',
24+
paramSchema: '{ value: string }',
25+
paramDescription: 'The value to be input',
26+
location: 'optional',
27+
call: async () => {},
28+
},
29+
// KeyboardPress: takes `value`; element location is optional.
// NOTE(review): paramDescription is identical to Input's — looks copy-pasted; confirm.
{
30+
name: 'KeyboardPress',
31+
description: 'Press a keyboard key',
32+
paramSchema: '{ value: string }',
33+
paramDescription: 'The value to be input',
34+
location: 'optional',
35+
call: async () => {},
36+
},
37+
// Scroll: element location is optional.
// NOTE(review): paramSchema/paramDescription are identical to Input's — looks copy-pasted; confirm.
{
38+
name: 'Scroll',
39+
description: 'Scroll the page',
40+
paramSchema: '{ value: string }',
41+
paramDescription: 'The value to be input',
42+
location: 'optional',
43+
call: async () => {},
44+
},
45+
];

0 commit comments

Comments
 (0)