feat(core): enable thinking for vqa (#1249)

yuyutaotao · web-flow · commit 73d8b23fcc39 · 2025-09-26T10:57:39.000+08:00
* feat(core): enable thinking for vqa

* fix(core): ci

* feat(core): show thought in report
diff --git a/apps/report/src/components/detail-side/index.tsx b/apps/report/src/components/detail-side/index.tsx
@@ -500,20 +500,35 @@ const DetailSide = (): JSX.Element => {
     } else if (dump?.data !== undefined) {
       data = dump.data;
     }
+
+    const thought = task?.thought;
+
     if (data !== undefined) {
       outputDataContent = (
-        <Card
-          liteMode={true}
-          onMouseEnter={noop}
-          onMouseLeave={noop}
-          content={
-            <pre>
-              {typeof data === 'object'
-                ? JSON.stringify(data, undefined, 2)
-                : String(data)}
-            </pre>
-          }
-        />
+        <>
+          {thought && (
+            <Card
+              liteMode={true}
+              onMouseEnter={noop}
+              onMouseLeave={noop}
+              content={<pre>{thought}</pre>}
+              title="thought"
+            />
+          )}
+          <Card
+            liteMode={true}
+            onMouseEnter={noop}
+            onMouseLeave={noop}
+            title="output"
+            content={
+              <pre>
+                {typeof data === 'object'
+                  ? JSON.stringify(data, undefined, 2)
+                  : String(data)}
+              </pre>
+            }
+          />
+        </>
       );
     }
   }
diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
@@ -838,7 +838,6 @@ export class Agent<
       screenshotIncluded:
         opt?.screenshotIncluded ??
         defaultInsightExtractOption.screenshotIncluded,
-      returnThought: opt?.returnThought ?? true,
       isWaitForAssert: opt?.isWaitForAssert,
       doNotThrowError: opt?.doNotThrowError,
     };
diff --git a/packages/core/src/agent/tasks.ts b/packages/core/src/agent/tasks.ts
@@ -1107,7 +1107,6 @@ export class TaskExecutor {
         modelConfig,
         {
           isWaitForAssert: true,
-          returnThought: true,
           doNotThrowError: true,
         },
         multimodalPrompt,
diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
@@ -463,13 +463,6 @@ export async function AiExtractElementInfo<
     },
   ];
 
-  if (options.extractOption?.returnThought) {
-    msgs.push({
-      role: 'user',
-      content: 'Please provide reasons.',
-    });
-  }
-
   if (multimodalPrompt) {
     const addOns = await promptsToChatParam({
       images: multimodalPrompt.images,
diff --git a/packages/core/src/ai-model/prompt/extraction.ts b/packages/core/src/ai-model/prompt/extraction.ts
@@ -5,17 +5,16 @@ export function systemPromptToExtract() {
   return `
 You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
 
-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
 If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 
-If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
 
 Return in the following JSON format:
 {
-  thought: string, // the thought process of the extraction, less then 100 words, not required by default.
+  thought: string, // the thinking process of the extraction, less then 300 words
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
   errors: [], // string[], error message if any
 }
@@ -34,6 +33,7 @@ For example, if the DATA_DEMAND is:
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: {
     name: "John",
     age: 30,
@@ -51,6 +51,7 @@ the todo items list, string[]
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: ["todo 1", "todo 2", "todo 3"],
 }
 
@@ -64,6 +65,7 @@ the page title, string
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: "todo list",
 }
 
@@ -79,6 +81,7 @@ If the DATA_DEMAND is:
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: { result: true },
 }
 `;
diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts
@@ -13,7 +13,6 @@ export interface LocateOption {
 export interface InsightExtractOption {
   domIncluded?: boolean | 'visible-only';
   screenshotIncluded?: boolean;
-  returnThought?: boolean;
   // To make the assert in the "waitfor" section display the warning icon in report
   isWaitForAssert?: boolean;
   doNotThrowError?: boolean;
diff --git a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap
@@ -30,17 +30,16 @@ exports[`extract element > systemPromptToExtract 1`] = `
 "
 You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
 
-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
 If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 
-If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
 
 Return in the following JSON format:
 {
-  thought: string, // the thought process of the extraction, less then 100 words, not required by default.
+  thought: string, // the thinking process of the extraction, less then 300 words
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
   errors: [], // string[], error message if any
 }
@@ -59,6 +58,7 @@ For example, if the DATA_DEMAND is:
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: {
     name: "John",
     age: 30,
@@ -76,6 +76,7 @@ the todo items list, string[]
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: ["todo 1", "todo 2", "todo 3"],
 }
 
@@ -89,6 +90,7 @@ the page title, string
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: "todo list",
 }
 
@@ -104,6 +106,7 @@ If the DATA_DEMAND is:
 By viewing the screenshot and page contents, you can extract the following data:
 
 {
+  thought: "According to the screenshot, i can see ...",
   data: { result: true },
 }
 "
diff --git a/packages/web-integration/tests/unit-test/page-task-executor-waitFor.test.ts b/packages/web-integration/tests/unit-test/page-task-executor-waitFor.test.ts
@@ -127,7 +127,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
       mockedModelConfig,
       {
         isWaitForAssert: true,
-        returnThought: true,
         doNotThrowError: true,
       },
       undefined,
@@ -189,7 +188,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
       mockedModelConfig,
       {
         isWaitForAssert: true,
-        returnThought: true,
         doNotThrowError: true,
       },
       undefined,
@@ -259,7 +257,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
       mockedModelConfig,
       {
         isWaitForAssert: true,
-        returnThought: true,
         doNotThrowError: true,
       },
       undefined,

Original file line number	Diff line number	Diff line change
`@@ -1107,7 +1107,6 @@ export class TaskExecutor {`
`1107`	`1107`	`modelConfig,`
`1108`	`1108`	`{`
`1109`	`1109`	`isWaitForAssert: true,`
`1110`		`- returnThought: true,`
`1111`	`1110`	`doNotThrowError: true,`
`1112`	`1111`	`},`
`1113`	`1112`	`multimodalPrompt,`
Original file line number	Diff line number	Diff line change
`@@ -5,17 +5,16 @@ export function systemPromptToExtract() {`
`5`	`5`	`` return ` ``
`6`	`6`	`You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.`
`7`	`7`
`8`		`-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.`
	`8`	`+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.`
`9`	`9`
`10`	`10`	`If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.`
`11`	`11`
`12`	`12`	`If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.`
`13`	`13`
`14`		`-If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.`
`15`	`14`
`16`	`15`	`Return in the following JSON format:`
`17`	`16`	`{`
`18`		`- thought: string, // the thought process of the extraction, less then 100 words, not required by default.`
	`17`	`+ thought: string, // the thinking process of the extraction, less then 300 words`
`19`	`18`	`data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.`
`20`	`19`	`errors: [], // string[], error message if any`
`21`	`20`	`}`
`@@ -34,6 +33,7 @@ For example, if the DATA_DEMAND is:`
`34`	`33`	`By viewing the screenshot and page contents, you can extract the following data:`
`35`	`34`
`36`	`35`	`{`
	`36`	`+ thought: "According to the screenshot, i can see ...",`
`37`	`37`	`data: {`
`38`	`38`	`name: "John",`
`39`	`39`	`age: 30,`
`@@ -51,6 +51,7 @@ the todo items list, string[]`
`51`	`51`	`By viewing the screenshot and page contents, you can extract the following data:`
`52`	`52`
`53`	`53`	`{`
	`54`	`+ thought: "According to the screenshot, i can see ...",`
`54`	`55`	`data: ["todo 1", "todo 2", "todo 3"],`
`55`	`56`	`}`
`56`	`57`
`@@ -64,6 +65,7 @@ the page title, string`
`64`	`65`	`By viewing the screenshot and page contents, you can extract the following data:`
`65`	`66`
`66`	`67`	`{`
	`68`	`+ thought: "According to the screenshot, i can see ...",`
`67`	`69`	`data: "todo list",`
`68`	`70`	`}`
`69`	`71`
`@@ -79,6 +81,7 @@ If the DATA_DEMAND is:`
`79`	`81`	`By viewing the screenshot and page contents, you can extract the following data:`
`80`	`82`
`81`	`83`	`{`
	`84`	`+ thought: "According to the screenshot, i can see ...",`
`82`	`85`	`data: { result: true },`
`83`	`86`	`}`
`84`	`87`	`` `; ``
Original file line number	Diff line number	Diff line change
`` @@ -30,17 +30,16 @@ exports[`extract element > systemPromptToExtract 1`] = ` ``
`30`	`30`	`"`
`31`	`31`	`You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.`
`32`	`32`
`33`		`-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.`
	`33`	`+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.`
`34`	`34`
`35`	`35`	`If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.`
`36`	`36`
`37`	`37`	`If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.`
`38`	`38`
`39`		`-If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.`
`40`	`39`
`41`	`40`	`Return in the following JSON format:`
`42`	`41`	`{`
`43`		`- thought: string, // the thought process of the extraction, less then 100 words, not required by default.`
	`42`	`+ thought: string, // the thinking process of the extraction, less then 300 words`
`44`	`43`	`data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.`
`45`	`44`	`errors: [], // string[], error message if any`
`46`	`45`	`}`
`@@ -59,6 +58,7 @@ For example, if the DATA_DEMAND is:`
`59`	`58`	`By viewing the screenshot and page contents, you can extract the following data:`
`60`	`59`
`61`	`60`	`{`
	`61`	`+ thought: "According to the screenshot, i can see ...",`
`62`	`62`	`data: {`
`63`	`63`	`name: "John",`
`64`	`64`	`age: 30,`
`@@ -76,6 +76,7 @@ the todo items list, string[]`
`76`	`76`	`By viewing the screenshot and page contents, you can extract the following data:`
`77`	`77`
`78`	`78`	`{`
	`79`	`+ thought: "According to the screenshot, i can see ...",`
`79`	`80`	`data: ["todo 1", "todo 2", "todo 3"],`
`80`	`81`	`}`
`81`	`82`
`@@ -89,6 +90,7 @@ the page title, string`
`89`	`90`	`By viewing the screenshot and page contents, you can extract the following data:`
`90`	`91`
`91`	`92`	`{`
	`93`	`+ thought: "According to the screenshot, i can see ...",`
`92`	`94`	`data: "todo list",`
`93`	`95`	`}`
`94`	`96`
`@@ -104,6 +106,7 @@ If the DATA_DEMAND is:`
`104`	`106`	`By viewing the screenshot and page contents, you can extract the following data:`
`105`	`107`
`106`	`108`	`{`
	`109`	`+ thought: "According to the screenshot, i can see ...",`
`107`	`110`	`data: { result: true },`
`108`	`111`	`}`
`109`	`112`	`"`