Skip to content

Commit b85bc88

Browse files
authored
feat(llm): image prompt (#940)
1 parent 01870d5 commit b85bc88

File tree

17 files changed

+377
-71
lines changed

17 files changed

+377
-71
lines changed

packages/core/src/ai-model/common.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import { getDebug } from '@midscene/shared/logger';
3131

3232
export type AIArgs = [
3333
ChatCompletionSystemMessageParam,
34-
ChatCompletionUserMessageParam,
34+
...ChatCompletionUserMessageParam[],
3535
];
3636

3737
export enum AIActionType {

packages/core/src/ai-model/inspect.ts

Lines changed: 107 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import type {
1010
InsightExtractOption,
1111
Rect,
1212
ReferenceImage,
13+
TMultimodalPrompt,
14+
TUserPrompt,
1315
UIContext,
1416
} from '@/types';
1517
import {
@@ -18,7 +20,11 @@ import {
1820
getAIConfigInBoolean,
1921
vlLocateMode,
2022
} from '@midscene/shared/env';
21-
import { cropByRect, paddingToMatchBlockByBase64 } from '@midscene/shared/img';
23+
import {
24+
cropByRect,
25+
paddingToMatchBlockByBase64,
26+
preProcessImageUrl,
27+
} from '@midscene/shared/img';
2228
import { getDebug } from '@midscene/shared/logger';
2329
import { assert } from '@midscene/shared/utils';
2430
import type {
@@ -56,17 +62,73 @@ import { callToGetJSONObject } from './service-caller/index';
5662

5763
export type AIArgs = [
5864
ChatCompletionSystemMessageParam,
59-
ChatCompletionUserMessageParam,
65+
...ChatCompletionUserMessageParam[],
6066
];
6167

6268
const debugInspect = getDebug('ai:inspect');
6369
const debugSection = getDebug('ai:section');
6470

71+
const extraTextFromUserPrompt = (prompt: TUserPrompt): string => {
72+
if (typeof prompt === 'string') {
73+
return prompt;
74+
} else {
75+
return prompt.prompt;
76+
}
77+
};
78+
79+
const promptsToChatParam = async (
80+
multimodalPrompt: TMultimodalPrompt,
81+
): Promise<ChatCompletionUserMessageParam[]> => {
82+
const msgs: ChatCompletionUserMessageParam[] = [];
83+
if (multimodalPrompt?.images?.length) {
84+
msgs.push({
85+
role: 'user',
86+
content: [
87+
{
88+
type: 'text',
89+
text: 'Next, I will provide all the reference images.',
90+
},
91+
],
92+
});
93+
94+
for (const item of multimodalPrompt.images) {
95+
const base64 = await preProcessImageUrl(
96+
item.url,
97+
!!multimodalPrompt.convertHttpImage2Base64,
98+
);
99+
100+
msgs.push({
101+
role: 'user',
102+
content: [
103+
{
104+
type: 'text',
105+
text: `reference image ${item.name}:`,
106+
},
107+
],
108+
});
109+
110+
msgs.push({
111+
role: 'user',
112+
content: [
113+
{
114+
type: 'image_url',
115+
image_url: {
116+
url: base64,
117+
detail: 'high',
118+
},
119+
},
120+
],
121+
});
122+
}
123+
}
124+
return msgs;
125+
};
126+
65127
export async function AiLocateElement<
66128
ElementType extends BaseElement = BaseElement,
67129
>(options: {
68130
context: UIContext<ElementType>;
69-
targetElementDescription: string;
131+
targetElementDescription: TUserPrompt;
70132
referenceImage?: ReferenceImage;
71133
callAI?: typeof callAiFn<AIElementResponse | [number, number]>;
72134
searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
@@ -90,7 +152,7 @@ export async function AiLocateElement<
90152

91153
const userInstructionPrompt = await findElementPrompt.format({
92154
pageDescription: description,
93-
targetElementDescription,
155+
targetElementDescription: extraTextFromUserPrompt(targetElementDescription),
94156
});
95157
const systemPrompt = systemPromptToLocateElement(vlLocateMode());
96158

@@ -137,6 +199,14 @@ export async function AiLocateElement<
137199
},
138200
];
139201

202+
if (typeof targetElementDescription !== 'string') {
203+
const addOns = await promptsToChatParam({
204+
images: targetElementDescription.images,
205+
convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,
206+
});
207+
msgs.push(...addOns);
208+
}
209+
140210
const callAIFn =
141211
callAI || callToGetJSONObject<AIElementResponse | [number, number]>;
142212

@@ -211,7 +281,7 @@ export async function AiLocateElement<
211281

212282
export async function AiLocateSection(options: {
213283
context: UIContext<BaseElement>;
214-
sectionDescription: string;
284+
sectionDescription: TUserPrompt;
215285
callAI?: typeof callAiFn<AISectionLocatorResponse>;
216286
}): Promise<{
217287
rect?: Rect;
@@ -225,7 +295,7 @@ export async function AiLocateSection(options: {
225295

226296
const systemPrompt = systemPromptToLocateSection(vlLocateMode());
227297
const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
228-
sectionDescription,
298+
sectionDescription: extraTextFromUserPrompt(sectionDescription),
229299
});
230300
const msgs: AIArgs = [
231301
{ role: 'system', content: systemPrompt },
@@ -247,6 +317,14 @@ export async function AiLocateSection(options: {
247317
},
248318
];
249319

320+
if (typeof sectionDescription !== 'string') {
321+
const addOns = await promptsToChatParam({
322+
images: sectionDescription.images,
323+
convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,
324+
});
325+
msgs.push(...addOns);
326+
}
327+
250328
const result = await callAiFn<AISectionLocatorResponse>(
251329
msgs,
252330
AIActionType.EXTRACT_DATA,
@@ -304,10 +382,11 @@ export async function AiExtractElementInfo<
304382
ElementType extends BaseElement = BaseElement,
305383
>(options: {
306384
dataQuery: string | Record<string, string>;
385+
multimodalPrompt?: TMultimodalPrompt;
307386
context: UIContext<ElementType>;
308387
extractOption?: InsightExtractOption;
309388
}) {
310-
const { dataQuery, context, extractOption } = options;
389+
const { dataQuery, context, extractOption, multimodalPrompt } = options;
311390
const systemPrompt = systemPromptToExtract();
312391

313392
const { screenshotBase64 } = context;
@@ -348,6 +427,14 @@ export async function AiExtractElementInfo<
348427
},
349428
];
350429

430+
if (multimodalPrompt) {
431+
const addOns = await promptsToChatParam({
432+
images: multimodalPrompt.images,
433+
convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,
434+
});
435+
msgs.push(...addOns);
436+
}
437+
351438
const result = await callAiFn<AIDataExtractionResponse<T>>(
352439
msgs,
353440
AIActionType.EXTRACT_DATA,
@@ -361,17 +448,19 @@ export async function AiExtractElementInfo<
361448

362449
export async function AiAssert<
363450
ElementType extends BaseElement = BaseElement,
364-
>(options: { assertion: string; context: UIContext<ElementType> }) {
451+
>(options: { assertion: TUserPrompt; context: UIContext<ElementType> }) {
365452
const { assertion, context } = options;
366453

367-
assert(assertion, 'assertion should be a string');
454+
assert(assertion, 'assertion should not be empty');
368455

369456
const { screenshotBase64 } = context;
370457

371458
const systemPrompt = systemPromptToAssert({
372459
isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS),
373460
});
374461

462+
const assertionText = extraTextFromUserPrompt(assertion);
463+
375464
const msgs: AIArgs = [
376465
{ role: 'system', content: systemPrompt },
377466
{
@@ -389,14 +478,22 @@ export async function AiAssert<
389478
text: `
390479
Here is the assertion. Please tell whether it is truthy according to the screenshot.
391480
=====================================
392-
${assertion}
481+
${assertionText}
393482
=====================================
394483
`,
395484
},
396485
],
397486
},
398487
];
399488

489+
if (typeof assertion !== 'string') {
490+
const addOns = await promptsToChatParam({
491+
images: assertion.images,
492+
convertHttpImage2Base64: assertion.convertHttpImage2Base64,
493+
});
494+
msgs.push(...addOns);
495+
}
496+
400497
const { content: assertResult, usage } = await callAiFn<AIAssertionResponse>(
401498
msgs,
402499
AIActionType.ASSERT,

packages/core/src/ai-model/prompt/extraction.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
99
1010
If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1111
12+
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
13+
1214
Return in the following JSON format:
1315
{
1416
data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.

packages/core/src/insight/index.ts

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ import type {
2727
LocateResult,
2828
PartialInsightDumpFromSDK,
2929
Rect,
30+
TMultimodalPrompt,
31+
TUserPrompt,
3032
UIContext,
3133
} from '@/types';
3234
import {
@@ -236,19 +238,10 @@ export default class Insight<
236238
};
237239
}
238240

239-
async extract<T = any>(input: string, opt?: InsightExtractOption): Promise<T>;
240-
async extract<T extends Record<string, string>>(
241-
input: T,
242-
opt?: InsightExtractOption,
243-
): Promise<Record<keyof T, any>>;
244-
async extract<T extends object>(
245-
input: Record<keyof T, string>,
246-
opt?: InsightExtractOption,
247-
): Promise<T>;
248-
249241
async extract<T>(
250242
dataDemand: InsightExtractParam,
251243
opt?: InsightExtractOption,
244+
multimodalPrompt?: TMultimodalPrompt,
252245
): Promise<any> {
253246
assert(
254247
typeof dataDemand === 'object' || typeof dataDemand === 'string',
@@ -263,6 +256,7 @@ export default class Insight<
263256
const { parseResult, usage } = await AiExtractElementInfo<T>({
264257
context,
265258
dataQuery: dataDemand,
259+
multimodalPrompt,
266260
extractOption: opt,
267261
});
268262

@@ -310,13 +304,7 @@ export default class Insight<
310304
};
311305
}
312306

313-
async assert(assertion: string): Promise<InsightAssertionResponse> {
314-
if (typeof assertion !== 'string') {
315-
throw new Error(
316-
'This is the assert method for Midscene, the first argument should be a string. If you want to use the assert method from Node.js, please import it from the Node.js assert module.',
317-
);
318-
}
319-
307+
async assert(assertion: TUserPrompt): Promise<InsightAssertionResponse> {
320308
const dumpSubscriber = this.onceDumpUpdatedFn;
321309
this.onceDumpUpdatedFn = undefined;
322310

packages/core/src/types.ts

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ export interface InsightDump extends DumpMeta {
192192
type: 'locate' | 'extract' | 'assert';
193193
logId: string;
194194
userQuery: {
195-
element?: string;
195+
element?: TUserPrompt;
196196
dataDemand?: InsightExtractParam;
197-
assertion?: string;
197+
assertion?: TUserPrompt;
198198
};
199199
matchedElement: BaseElement[];
200200
matchedRect?: Rect;
@@ -309,7 +309,7 @@ export interface PlanningActionParamInputOrKeyPress {
309309
export type PlanningActionParamScroll = scrollParam;
310310

311311
export interface PlanningActionParamAssert {
312-
assertion: string;
312+
assertion: TUserPrompt;
313313
}
314314

315315
export interface PlanningActionParamSleep {
@@ -597,3 +597,27 @@ export interface StreamingAIResponse {
597597
/** Whether the response was streamed */
598598
isStreamed: boolean;
599599
}
600+
601+
export type TMultimodalPrompt = {
602+
/**
603+
* Supports using images to inspect elements.
604+
* The "images" field is an array of objects, each with an image "name" and an image "url".
605+
* The image url can be a local path, an http link, or a base64 string.
606+
*/
607+
images?: {
608+
name: string;
609+
url: string;
610+
}[];
611+
/**
612+
* By default, an image url in the "images" field that starts with `https://` or `http://` is sent directly to the LLM.
613+
* If the images are not accessible to the LLM (a common case is an image url that is only reachable on an internal network), you can enable this option.
614+
* The image will then be downloaded and converted to base64 format.
615+
*/
616+
convertHttpImage2Base64?: boolean;
617+
};
618+
619+
export type TUserPrompt =
620+
| string
621+
| ({
622+
prompt: string;
623+
} & Partial<TMultimodalPrompt>);

0 commit comments

Comments
 (0)