web-infra-dev
diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts
Lines changed: 1 addition & 1 deletion
diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
Lines changed: 107 additions & 10 deletions
diff --git a/packages/core/src/ai-model/prompt/extraction.ts b/packages/core/src/ai-model/prompt/extraction.ts
Lines changed: 2 additions & 0 deletions
diff --git a/packages/core/src/insight/index.ts b/packages/core/src/insight/index.ts
Lines changed: 5 additions & 17 deletions
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
Lines changed: 27 additions & 3 deletions
@@ -31,7 +31,7 @@ import { getDebug } from '@midscene/shared/logger';
 
 export type AIArgs = [
   ChatCompletionSystemMessageParam,
-  ChatCompletionUserMessageParam,
+  ...ChatCompletionUserMessageParam[],
 ];
 
 export enum AIActionType {
 
@@ -10,6 +10,8 @@ import type {
   InsightExtractOption,
   Rect,
   ReferenceImage,
+  TMultimodalPrompt,
+  TUserPrompt,
   UIContext,
 } from '@/types';
 import {
@@ -18,7 +20,11 @@ import {
   getAIConfigInBoolean,
   vlLocateMode,
 } from '@midscene/shared/env';
-import { cropByRect, paddingToMatchBlockByBase64 } from '@midscene/shared/img';
+import {
+  cropByRect,
+  paddingToMatchBlockByBase64,
+  preProcessImageUrl,
+} from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
 import type {
@@ -56,17 +62,73 @@ import { callToGetJSONObject } from './service-caller/index';
 
 export type AIArgs = [
   ChatCompletionSystemMessageParam,
-  ChatCompletionUserMessageParam,
+  ...ChatCompletionUserMessageParam[],
 ];
 
 const debugInspect = getDebug('ai:inspect');
 const debugSection = getDebug('ai:section');
 
+const extraTextFromUserPrompt = (prompt: TUserPrompt): string => { // returns only the text part of a user prompt
+  if (typeof prompt === 'string') {
+    return prompt; // a plain string prompt is already the text
+  } else {
+    return prompt.prompt; // object form: the text is the `prompt` field; other fields are ignored here
+  }
+};
+
+const promptsToChatParam = async ( // builds the extra user messages that carry the reference images
+  multimodalPrompt: TMultimodalPrompt,
+): Promise<ChatCompletionUserMessageParam[]> => {
+  const msgs: ChatCompletionUserMessageParam[] = [];
+  if (multimodalPrompt?.images?.length) { // no images (missing or empty list) => the result stays empty
+    msgs.push({ // one introductory text message before the images
+      role: 'user',
+      content: [
+        {
+          type: 'text',
+          text: 'Next, I will provide all the reference images.',
+        },
+      ],
+    });
+
+    for (const item of multimodalPrompt.images) { // images are awaited one at a time, in the order given
+      const base64 = await preProcessImageUrl(
+        item.url,
+        !!multimodalPrompt.convertHttpImage2Base64, // an unset option is coerced to false
+      );
+
+      msgs.push({ // text label carrying the image name
+        role: 'user',
+        content: [
+          {
+            type: 'text',
+            text: `reference image ${item.name}:`,
+          },
+        ],
+      });
+
+      msgs.push({ // the image itself, sent as its own message right after its label
+        role: 'user',
+        content: [
+          {
+            type: 'image_url',
+            image_url: {
+              url: base64, // NOTE(review): presumably still an http(s) URL when conversion is off — confirm in preProcessImageUrl
+              detail: 'high',
+            },
+          },
+        ],
+      });
+    }
+  }
+  return msgs; // callers append these after their own messages
+};
+
 export async function AiLocateElement<
   ElementType extends BaseElement = BaseElement,
 >(options: {
   context: UIContext<ElementType>;
-  targetElementDescription: string;
+  targetElementDescription: TUserPrompt;
   referenceImage?: ReferenceImage;
   callAI?: typeof callAiFn<AIElementResponse | [number, number]>;
   searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
@@ -90,7 +152,7 @@ export async function AiLocateElement<
 
   const userInstructionPrompt = await findElementPrompt.format({
     pageDescription: description,
-    targetElementDescription,
+    targetElementDescription: extraTextFromUserPrompt(targetElementDescription),
   });
   const systemPrompt = systemPromptToLocateElement(vlLocateMode());
 
@@ -137,6 +199,14 @@ export async function AiLocateElement<
     },
   ];
 
+  if (typeof targetElementDescription !== 'string') {
+    const addOns = await promptsToChatParam({
+      images: targetElementDescription.images,
+      convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,
+    });
+    msgs.push(...addOns);
+  }
+
   const callAIFn =
     callAI || callToGetJSONObject<AIElementResponse | [number, number]>;
 
@@ -211,7 +281,7 @@ export async function AiLocateElement<
 
 export async function AiLocateSection(options: {
   context: UIContext<BaseElement>;
-  sectionDescription: string;
+  sectionDescription: TUserPrompt;
   callAI?: typeof callAiFn<AISectionLocatorResponse>;
 }): Promise<{
   rect?: Rect;
@@ -225,7 +295,7 @@ export async function AiLocateSection(options: {
 
   const systemPrompt = systemPromptToLocateSection(vlLocateMode());
   const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
-    sectionDescription,
+    sectionDescription: extraTextFromUserPrompt(sectionDescription),
   });
   const msgs: AIArgs = [
     { role: 'system', content: systemPrompt },
@@ -247,6 +317,14 @@ export async function AiLocateSection(options: {
     },
   ];
 
+  if (typeof sectionDescription !== 'string') {
+    const addOns = await promptsToChatParam({
+      images: sectionDescription.images,
+      convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,
+    });
+    msgs.push(...addOns);
+  }
+
   const result = await callAiFn<AISectionLocatorResponse>(
     msgs,
     AIActionType.EXTRACT_DATA,
@@ -304,10 +382,11 @@ export async function AiExtractElementInfo<
   ElementType extends BaseElement = BaseElement,
 >(options: {
   dataQuery: string | Record<string, string>;
+  multimodalPrompt?: TMultimodalPrompt;
   context: UIContext<ElementType>;
   extractOption?: InsightExtractOption;
 }) {
-  const { dataQuery, context, extractOption } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
 
   const { screenshotBase64 } = context;
@@ -348,6 +427,14 @@ export async function AiExtractElementInfo<
     },
   ];
 
+  if (multimodalPrompt) {
+    const addOns = await promptsToChatParam({
+      images: multimodalPrompt.images,
+      convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,
+    });
+    msgs.push(...addOns);
+  }
+
   const result = await callAiFn<AIDataExtractionResponse<T>>(
     msgs,
     AIActionType.EXTRACT_DATA,
@@ -361,17 +448,19 @@ export async function AiExtractElementInfo<
 
 export async function AiAssert<
   ElementType extends BaseElement = BaseElement,
->(options: { assertion: string; context: UIContext<ElementType> }) {
+>(options: { assertion: TUserPrompt; context: UIContext<ElementType> }) {
   const { assertion, context } = options;
 
-  assert(assertion, 'assertion should be a string');
+  assert(assertion, 'assertion should not be empty');
 
   const { screenshotBase64 } = context;
 
   const systemPrompt = systemPromptToAssert({
     isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS),
   });
 
+  const assertionText = extraTextFromUserPrompt(assertion);
+
   const msgs: AIArgs = [
     { role: 'system', content: systemPrompt },
     {
@@ -389,14 +478,22 @@ export async function AiAssert<
           text: `
 Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
-${assertion}
+${assertionText}
 =====================================
   `,
         },
       ],
     },
   ];
 
+  if (typeof assertion !== 'string') {
+    const addOns = await promptsToChatParam({
+      images: assertion.images,
+      convertHttpImage2Base64: assertion.convertHttpImage2Base64,
+    });
+    msgs.push(...addOns);
+  }
+
   const { content: assertResult, usage } = await callAiFn<AIAssertionResponse>(
     msgs,
     AIActionType.ASSERT,
 
@@ -9,6 +9,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
+
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
 
@@ -27,6 +27,8 @@ import type {
   LocateResult,
   PartialInsightDumpFromSDK,
   Rect,
+  TMultimodalPrompt,
+  TUserPrompt,
   UIContext,
 } from '@/types';
 import {
@@ -236,19 +238,10 @@ export default class Insight<
     };
   }
 
-  async extract<T = any>(input: string, opt?: InsightExtractOption): Promise<T>;
-  async extract<T extends Record<string, string>>(
-    input: T,
-    opt?: InsightExtractOption,
-  ): Promise<Record<keyof T, any>>;
-  async extract<T extends object>(
-    input: Record<keyof T, string>,
-    opt?: InsightExtractOption,
-  ): Promise<T>;
-
   async extract<T>(
     dataDemand: InsightExtractParam,
     opt?: InsightExtractOption,
+    multimodalPrompt?: TMultimodalPrompt,
   ): Promise<any> {
     assert(
       typeof dataDemand === 'object' || typeof dataDemand === 'string',
@@ -263,6 +256,7 @@ export default class Insight<
     const { parseResult, usage } = await AiExtractElementInfo<T>({
       context,
       dataQuery: dataDemand,
+      multimodalPrompt,
       extractOption: opt,
     });
 
@@ -310,13 +304,7 @@ export default class Insight<
     };
   }
 
-  async assert(assertion: string): Promise<InsightAssertionResponse> {
-    if (typeof assertion !== 'string') {
-      throw new Error(
-        'This is the assert method for Midscene, the first argument should be a string. If you want to use the assert method from Node.js, please import it from the Node.js assert module.',
-      );
-    }
-
+  async assert(assertion: TUserPrompt): Promise<InsightAssertionResponse> {
     const dumpSubscriber = this.onceDumpUpdatedFn;
     this.onceDumpUpdatedFn = undefined;
 
 
@@ -192,9 +192,9 @@ export interface InsightDump extends DumpMeta {
   type: 'locate' | 'extract' | 'assert';
   logId: string;
   userQuery: {
-    element?: string;
+    element?: TUserPrompt;
     dataDemand?: InsightExtractParam;
-    assertion?: string;
+    assertion?: TUserPrompt;
   };
   matchedElement: BaseElement[];
   matchedRect?: Rect;
@@ -309,7 +309,7 @@ export interface PlanningActionParamInputOrKeyPress {
 export type PlanningActionParamScroll = scrollParam;
 
 export interface PlanningActionParamAssert {
-  assertion: string;
+  assertion: TUserPrompt;
 }
 
 export interface PlanningActionParamSleep {
@@ -597,3 +597,27 @@ export interface StreamingAIResponse {
   /** Whether the response was streamed */
   isStreamed: boolean;
 }
+
+export type TMultimodalPrompt = {
+  /**
+   * Supports using images to inspect elements.
+   * The "images" field is an array of entries, each with an image `name` and an image `url`.
+   * The image url can be a local path, an http link, or a base64 string.
+   */
+  images?: {
+    name: string;
+    url: string;
+  }[];
+  /**
+   * By default, an image url in the "images" field that starts with `https://` or `http://` is sent to the LLM directly.
+   * If the images are not accessible to the LLM (a common case is an image url that is only reachable on an internal network), you can enable this option.
+   * The image will then be downloaded and converted to base64 format.
+   */
+  convertHttpImage2Base64?: boolean;
+};
+
+export type TUserPrompt =
+  | string // plain text prompt
+  | ({
+      prompt: string; // the text of the prompt
+    } & Partial<TMultimodalPrompt>); // object form may also carry `images` and `convertHttpImage2Base64`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat`
`9`	`9`
`10`	`10`	`If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.`
`11`	`11`
	`12`	`+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.`
	`13`	`+`
`12`	`14`	`Return in the following JSON format:`
`13`	`15`	`{`
`14`	`16`	`data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.`