Skip to content

Commit 90d5c09

Browse files
authored
feat(android): add long press functionality (#963)
* feat(android): add long press functionality
* feat(android): add pull down and pull up functionality for refresh actions
* feat(android): update action log to include AndroidPull for refresh actions
1 parent 71409ee commit 90d5c09

File tree

11 files changed

+238
-76
lines changed

11 files changed

+238
-76
lines changed

packages/android/src/page/index.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,73 @@ ${Object.keys(size)
779779
await adb.shell('input keyevent 187');
780780
}
781781

782+
/**
 * Long-presses the screen at the given point.
 *
 * The press is issued as an `input swipe` whose start and end points are
 * identical, so the touch is held in place for the whole swipe duration.
 *
 * @param x - Horizontal position; run through `adjustCoordinates` before use.
 * @param y - Vertical position; run through `adjustCoordinates` before use.
 * @param duration - Hold time in milliseconds (default 1000). Fractions are
 *   rounded; values that are not finite positive numbers fall back to 1000.
 */
async longPress(x: number, y: number, duration = 1000): Promise<void> {
  const adb = await this.getAdb();

  // The value ends up inside a shell command line, and callers may forward
  // it without validation. Normalise it to a positive whole number of
  // milliseconds so the command is always well-formed.
  const requestedMs = Number(duration);
  const holdMs =
    Number.isFinite(requestedMs) && requestedMs > 0
      ? Math.max(1, Math.round(requestedMs))
      : 1000;

  // Use adjusted coordinates
  const { x: adjustedX, y: adjustedY } = this.adjustCoordinates(x, y);
  await adb.shell(
    `input swipe ${adjustedX} ${adjustedY} ${adjustedX} ${adjustedY} ${holdMs}`,
  );
}
791+
792+
/**
 * Drags downwards on the screen, e.g. to trigger pull-to-refresh.
 *
 * @param startPoint - Where the drag begins (`left`/`top`). When omitted the
 *   drag starts at the horizontal centre, 15% of the screen height from the top.
 * @param distance - How far to drag downwards. A falsy value (omitted or 0)
 *   falls back to half of the screen height.
 * @param duration - Duration handed to `pullDrag` (default 800).
 */
async pullDown(
  startPoint?: Point,
  distance?: number,
  duration = 800,
): Promise<void> {
  const { width: screenWidth, height: screenHeight } = await this.size();

  // Start away from the very top edge unless the caller picked a point.
  let originX = screenWidth / 2;
  let originY = screenHeight * 0.15;
  if (startPoint) {
    originX = startPoint.left;
    originY = startPoint.top;
  }

  // A generous default travel makes it more likely the refresh is triggered.
  const travel = distance ? distance : screenHeight * 0.5;

  // Straight vertical drag: x is unchanged, y grows by the travel distance.
  await this.pullDrag(
    { x: originX, y: originY },
    { x: originX, y: originY + travel },
    duration,
  );

  // Short pause so the refresh has time to start.
  await sleep(200);
}
812+
813+
/**
 * Issues a single `input swipe` between two points with an explicit duration.
 * Shared by the pull gestures so they use the same drag implementation.
 *
 * @param from - Start point; run through `adjustCoordinates` before use.
 * @param to - End point; run through `adjustCoordinates` before use.
 * @param duration - Swipe duration in milliseconds. Fractions are rounded and
 *   negative values are clamped to 0.
 * @throws Error when `duration` cannot be interpreted as a finite number.
 */
private async pullDrag(
  from: { x: number; y: number },
  to: { x: number; y: number },
  duration: number,
): Promise<void> {
  // The duration is interpolated into a shell command line; reject anything
  // that is not a number and normalise the rest to a whole, non-negative value
  // so the command is always well-formed.
  const requestedMs = Number(duration);
  if (!Number.isFinite(requestedMs)) {
    throw new Error(`Invalid pull duration: ${duration}`);
  }
  const durationMs = Math.max(0, Math.round(requestedMs));

  const adb = await this.getAdb();

  // Use adjusted coordinates
  const { x: fromX, y: fromY } = this.adjustCoordinates(from.x, from.y);
  const { x: toX, y: toY } = this.adjustCoordinates(to.x, to.y);

  await adb.shell(`input swipe ${fromX} ${fromY} ${toX} ${toY} ${durationMs}`);
}
827+
828+
/**
 * Drags upwards on the screen, e.g. to load more content at the end of a list.
 *
 * @param startPoint - Where the drag begins (`left`/`top`). When omitted the
 *   drag starts at the horizontal centre, 85% of the screen height from the top.
 * @param distance - How far to drag upwards. A falsy value (omitted or 0)
 *   falls back to 40% of the screen height.
 * @param duration - Duration handed to `pullDrag` (default 600).
 */
async pullUp(
  startPoint?: Point,
  distance?: number,
  duration = 600,
): Promise<void> {
  const { width: screenWidth, height: screenHeight } = await this.size();

  // Start near the bottom centre unless the caller picked a point.
  let originX = screenWidth / 2;
  let originY = screenHeight * 0.85;
  if (startPoint) {
    originX = startPoint.left;
    originY = startPoint.top;
  }

  // Default travel is 40% of the screen height.
  const travel = distance ? distance : screenHeight * 0.4;

  // Straight vertical drag: x is unchanged, y shrinks by the travel distance.
  await this.pullDrag(
    { x: originX, y: originY },
    { x: originX, y: originY - travel },
    duration,
  );

  await sleep(100);
}
848+
782849
/**
 * Placeholder: xpath lookup by id is not supported here.
 * The returned promise always rejects with a "Not implemented" error.
 */
async getXpathsById(id: string): Promise<string[]> {
  const notImplemented = new Error('Not implemented');
  throw notImplemented;
}

packages/core/src/ai-model/common.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,9 @@ export function buildYamlFlowFromPlans(
376376
} else if (
377377
type === 'AndroidBackButton' ||
378378
type === 'AndroidHomeButton' ||
379-
type === 'AndroidRecentAppsButton'
379+
type === 'AndroidRecentAppsButton' ||
380+
type === 'AndroidLongPress' ||
381+
type === 'AndroidPull'
380382
) {
381383
// not implemented in yaml yet
382384
} else if (

packages/core/src/ai-model/prompt/llm-planning.ts

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i
2525
2626
Restriction:
2727
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
28-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === 'android' ? ', AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton.' : '.'}
28+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === 'android' ? ', AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull.' : '.'}
2929
- Don't repeat actions in the previous logs.
3030
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
3131
@@ -40,7 +40,9 @@ ${
4040
pageType === 'android'
4141
? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
4242
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
43-
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }`
43+
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
44+
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
45+
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')`
4446
: ''
4547
}
4648
@@ -93,7 +95,7 @@ You are a versatile professional in software UI automation. Your outstanding con
9395
## Workflow
9496
9597
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
96-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === 'android' ? '/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton' : ''}). The "About the action" section below will give you more details.
98+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === 'android' ? '/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull' : ''}). The "About the action" section below will give you more details.
9799
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
98100
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
99101
5. Consider whether the user's instruction will be accomplished after all the actions
@@ -154,7 +156,11 @@ ${
154156
- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
155157
* {{ param: {{}} }}
156158
- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
157-
* {{ param: {{}} }}`
159+
* {{ param: {{}} }}
160+
- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
161+
* {{ param: {{ x: number, y: number, duration?: number }} }}
162+
- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
163+
* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}`
158164
: ''
159165
}
160166
`;
@@ -278,7 +284,7 @@ export const planSchema: ResponseFormatJSONSchema = {
278284
type: {
279285
type: 'string',
280286
description:
281-
'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"',
287+
'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"',
282288
},
283289
param: {
284290
anyOf: [

packages/core/src/ai-model/ui-tars-planning.ts

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ type ActionType =
2828
| 'wait'
2929
| 'androidBackButton'
3030
| 'androidHomeButton'
31-
| 'androidRecentAppsButton';
31+
| 'androidRecentAppsButton'
32+
| 'androidLongPress'
33+
| 'androidPull';
3234

3335
const debug = getDebug('ui-tars-planning');
3436
const bboxSize = 10;
@@ -199,6 +201,42 @@ export async function vlmPlanning(options: {
199201
type: 'AndroidRecentAppsButton',
200202
param: {},
201203
});
204+
} else if (action.action_type === 'androidLongPress') {
205+
assert(
206+
action.action_inputs.start_coords,
207+
'start_coords is required for androidLongPress',
208+
);
209+
const point = action.action_inputs.start_coords;
210+
transformActions.push({
211+
type: 'AndroidLongPress',
212+
param: {
213+
x: point[0],
214+
y: point[1],
215+
duration: 1000,
216+
},
217+
locate: null,
218+
thought: action.thought || '',
219+
});
220+
} else if (action.action_type === 'androidPull') {
221+
const pullDirection = action.action_inputs.direction || 'down';
222+
const startPoint = action.action_inputs.start_coords
223+
? {
224+
x: action.action_inputs.start_coords[0],
225+
y: action.action_inputs.start_coords[1],
226+
}
227+
: undefined;
228+
229+
transformActions.push({
230+
type: 'AndroidPull',
231+
param: {
232+
direction: pullDirection as 'up' | 'down',
233+
startPoint,
234+
distance: (action.action_inputs as any).distance,
235+
duration: (action.action_inputs as any).duration || 500,
236+
},
237+
locate: null,
238+
thought: action.thought || '',
239+
});
202240
}
203241
});
204242

@@ -315,14 +353,23 @@ interface FinishedAction extends BaseAction {
315353
action_inputs: Record<string, never>;
316354
}
317355

356+
/** Parsed action describing a long press on an Android device. */
interface AndroidLongPressAction extends BaseAction {
  action_type: 'androidLongPress';
  action_inputs: {
    /** Coordinates for long press */
    start_coords: [number, number];
    /** Duration in milliseconds */
    duration?: number;
  };
}
363+
318364
/**
 * Parsed action describing a pull gesture on an Android device.
 * Mirrors the inputs read when an 'androidPull' action is converted into an
 * 'AndroidPull' plan: direction (treated as 'down' when absent), an optional
 * start point, and optional distance and duration.
 */
interface AndroidPullAction extends BaseAction {
  action_type: 'androidPull';
  action_inputs: {
    direction?: 'up' | 'down';
    start_coords?: [number, number];
    distance?: number;
    duration?: number;
  };
}

/** Union of the action shapes declared in this file. */
export type Action =
  | ClickAction
  | DragAction
  | TypeAction
  | HotkeyAction
  | ScrollAction
  | FinishedAction
  | WaitAction
  | AndroidLongPressAction
  | AndroidPullAction;
326373

327374
export async function resizeImageForUiTars(imageBase64: string, size: Size) {
328375
if (

packages/core/src/types.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,9 @@ export interface PlanningAction<ParamType = any> {
272272
| 'Finished'
273273
| 'AndroidBackButton'
274274
| 'AndroidHomeButton'
275-
| 'AndroidRecentAppsButton';
275+
| 'AndroidRecentAppsButton'
276+
| 'AndroidLongPress'
277+
| 'AndroidPull';
276278
param: ParamType;
277279
locate?: PlanningLocateParam | null;
278280
}
@@ -321,6 +323,19 @@ export interface PlanningActionParamError {
321323
/** Wait-for parameters: the `AgentWaitForOpt` options plus an `assertion` string. */
export type PlanningActionParamWaitFor = AgentWaitForOpt & {
  assertion: string;
};
326+
327+
/** Parameters of an 'AndroidLongPress' planning action. */
export interface PlanningActionParamAndroidLongPress {
  /** Horizontal position of the press. */
  x: number;
  /** Vertical position of the press. */
  y: number;
  /** Optional hold time; the executor forwards it unchanged to the page's `longPress`. */
  duration?: number;
}
332+
333+
/** Parameters of an 'AndroidPull' planning action. */
export interface PlanningActionParamAndroidPull {
  /** 'down' selects the pull-down gesture, 'up' the pull-up gesture. */
  direction: 'up' | 'down';
  /** Optional point where the gesture starts. */
  startPoint?: { x: number; y: number };
  /** Optional length of the pull. */
  distance?: number;
  /** Optional duration of the pull. */
  duration?: number;
}
324339
/**
325340
* misc
326341
*/

packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ exports[`system prompts > planning - 4o - response format 1`] = `
483483
"type": "string",
484484
},
485485
"type": {
486-
"description": "Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"",
486+
"description": "Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"",
487487
"type": "string",
488488
},
489489
},
@@ -683,7 +683,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i
683683
684684
Restriction:
685685
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
686-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton.
686+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull.
687687
- Don't repeat actions in the previous logs.
688688
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing 2d bounding box as [xmin, ymin, xmax, ymax].
689689
@@ -697,6 +697,8 @@ Supporting actions:
697697
- AndroidBackButton: { type: "AndroidBackButton", param: {} }
698698
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
699699
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
700+
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
701+
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')
700702
701703
Field description:
702704
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.

packages/web-integration/src/common/page.d.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ export interface AndroidDevicePage extends AbstractPage {
3434
back(): Promise<void>;
3535
home(): Promise<void>;
3636
recentApps(): Promise<void>;
37+
longPress(x: number, y: number, duration?: number): Promise<void>;
38+
pullDown(
39+
startPoint?: Point,
40+
distance?: number,
41+
duration?: number,
42+
): Promise<void>;
43+
pullUp(
44+
startPoint?: Point,
45+
distance?: number,
46+
duration?: number,
47+
): Promise<void>;
3748
}
3849

3950
export type AndroidDeviceInputOpt = {

packages/web-integration/src/common/tasks.ts

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import {
2424
type PageType,
2525
type PlanningAIResponse,
2626
type PlanningAction,
27+
type PlanningActionParamAndroidLongPress,
28+
type PlanningActionParamAndroidPull,
2729
type PlanningActionParamAssert,
2830
type PlanningActionParamError,
2931
type PlanningActionParamHover,
@@ -705,6 +707,57 @@ export class PageTaskExecutor {
705707
},
706708
};
707709
tasks.push(taskActionAndroidRecentAppsButton);
710+
} else if (plan.type === 'AndroidLongPress') {
711+
const taskActionAndroidLongPress: ExecutionTaskActionApply<PlanningActionParamAndroidLongPress> =
712+
{
713+
type: 'Action',
714+
subType: 'AndroidLongPress',
715+
param: plan.param as PlanningActionParamAndroidLongPress,
716+
thought: plan.thought,
717+
locate: plan.locate,
718+
executor: async (param) => {
719+
assert(
720+
isAndroidPage(this.page),
721+
'Cannot use long press on non-Android devices',
722+
);
723+
const { x, y, duration } = param;
724+
await this.page.longPress(x, y, duration);
725+
},
726+
};
727+
tasks.push(taskActionAndroidLongPress);
728+
} else if (plan.type === 'AndroidPull') {
729+
const taskActionAndroidPull: ExecutionTaskActionApply<PlanningActionParamAndroidPull> =
730+
{
731+
type: 'Action',
732+
subType: 'AndroidPull',
733+
param: plan.param as PlanningActionParamAndroidPull,
734+
thought: plan.thought,
735+
locate: plan.locate,
736+
executor: async (param) => {
737+
assert(
738+
isAndroidPage(this.page),
739+
'Cannot use pull action on non-Android devices',
740+
);
741+
const { direction, startPoint, distance, duration } = param;
742+
743+
const convertedStartPoint = startPoint
744+
? { left: startPoint.x, top: startPoint.y }
745+
: undefined;
746+
747+
if (direction === 'down') {
748+
await this.page.pullDown(
749+
convertedStartPoint,
750+
distance,
751+
duration,
752+
);
753+
} else if (direction === 'up') {
754+
await this.page.pullUp(convertedStartPoint, distance, duration);
755+
} else {
756+
throw new Error(`Unknown pull direction: ${direction}`);
757+
}
758+
},
759+
};
760+
tasks.push(taskActionAndroidPull);
708761
} else {
709762
throw new Error(`Unknown or unsupported task type: ${plan.type}`);
710763
}

0 commit comments

Comments
 (0)