refactor(core): make aiAssert use the same implementation as aiBoolean (#992)

EAGzzyCSL · web-flow · commit de46a7b9ac83 · 2025-08-05T16:45:07.000+08:00
diff --git a/packages/core/src/ai-model/action-executor.ts b/packages/core/src/ai-model/action-executor.ts
@@ -63,7 +63,7 @@ export class Executor {
     }
   }
 
-  async flush(): Promise<any> {
+  async flush(): Promise<{ output: any; thought?: string } | undefined> {
     if (this.status === 'init' && this.tasks.length > 0) {
       console.warn(
         'illegal state for executor, status is init but tasks are not empty',
@@ -178,7 +178,11 @@ export class Executor {
     if (this.tasks.length) {
       // return the last output
       const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
-      return this.tasks[outputIndex].output;
+      const { thought, output } = this.tasks[outputIndex];
+      return {
+        thought,
+        output,
+      };
     }
   }
 
diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
@@ -427,6 +427,13 @@ export async function AiExtractElementInfo<
     },
   ];
 
+  if (options.extractOption?.returnThought) {
+    msgs.push({
+      role: 'user',
+      content: 'Please provide reasons.',
+    });
+  }
+
   if (multimodalPrompt) {
     const addOns = await promptsToChatParam({
       images: multimodalPrompt.images,
diff --git a/packages/core/src/ai-model/prompt/extraction.ts b/packages/core/src/ai-model/prompt/extraction.ts
@@ -11,8 +11,11 @@ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Ar
 
 If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 
+If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
+
 Return in the following JSON format:
 {
+  thought: string, // the thought process of the extraction, less then 100 words, not required by default.
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
   errors: [], // string[], error message if any
 }
diff --git a/packages/core/src/insight/index.ts b/packages/core/src/insight/index.ts
@@ -242,7 +242,11 @@ export default class Insight<
     dataDemand: InsightExtractParam,
     opt?: InsightExtractOption,
     multimodalPrompt?: TMultimodalPrompt,
-  ): Promise<any> {
+  ): Promise<{
+    data: T;
+    thought?: string;
+    usage?: AIUsageInfo;
+  }> {
     assert(
       typeof dataDemand === 'object' || typeof dataDemand === 'string',
       `dataDemand should be object or string, but get ${typeof dataDemand}`,
@@ -283,7 +287,7 @@ export default class Insight<
       error: errorLog,
     };
 
-    const { data } = parseResult || {};
+    const { data, thought } = parseResult || {};
 
     // 4
     emitInsightDump(
@@ -300,6 +304,7 @@ export default class Insight<
 
     return {
       data,
+      thought,
       usage,
     };
   }
diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts
@@ -82,6 +82,7 @@ export type AIElementResponse =
 export interface AIDataExtractionResponse<DataDemand> {
   data: DataDemand;
   errors?: string[];
+  thought?: string;
 }
 
 export interface AISectionLocatorResponse {
diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts
@@ -9,6 +9,7 @@ export interface LocateOption {
 export interface InsightExtractOption {
   domIncluded?: boolean | 'visible-only';
   screenshotIncluded?: boolean;
+  returnThought?: boolean;
 }
 
 export interface ReferenceImage {
diff --git a/packages/core/tests/unit-test/executor/index.test.ts b/packages/core/tests/unit-test/executor/index.test.ts
@@ -103,7 +103,7 @@ describe(
       const dump = executor.dump();
       expect(dump.logTime).toBeTruthy();
 
-      expect(flushResult).toBe(flushResultData);
+      expect(flushResult?.output).toBe(flushResultData);
     });
 
     it('insight - init and append', async () => {
@@ -177,7 +177,7 @@ describe(
       expect(executor.status).toBe('error');
       expect(executor.latestErrorTask()).toBeTruthy();
       expect(executor.isInErrorState()).toBeTruthy();
-      expect(r).toEqual('error-output');
+      expect(r?.output).toEqual('error-output');
 
       // expect to throw an error
       expect(async () => {
diff --git a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap
@@ -36,8 +36,11 @@ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Ar
 
 If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 
+If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
+
 Return in the following JSON format:
 {
+  thought: string, // the thought process of the extraction, less then 100 words, not required by default.
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
   errors: [], // string[], error message if any
 }
diff --git a/packages/web-integration/src/common/agent.ts b/packages/web-integration/src/common/agent.ts
@@ -616,17 +616,25 @@ export class PageAgent<PageType extends WebPage = WebPage> {
   }
 
   async aiAssert(assertion: TUserPrompt, msg?: string, opt?: AgentAssertOpt) {
-    const { output, executor } = await this.taskExecutor.assert(assertion);
+    const { output, executor, thought } = await this.taskExecutor.assert(
+      assertion,
+      {
+        returnThought: true,
+      },
+    );
     await this.afterTaskRunning(executor, true);
 
-    if (output && opt?.keepRawResponse) {
-      return output;
+    if (opt?.keepRawResponse) {
+      return {
+        pass: output,
+        thought,
+      };
     }
 
-    if (!output?.pass) {
+    if (!output) {
       const errMsg = msg || `Assertion failed: ${assertion}`;
       const reasonMsg = `Reason: ${
-        output?.thought || executor.latestErrorTask()?.error || '(no_reason)'
+        thought || executor.latestErrorTask()?.error || '(no_reason)'
       }`;
       throw new Error(`${errMsg}\n${reasonMsg}`);
     }
diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts
@@ -65,6 +65,7 @@ import {
 
 interface ExecutionResult<OutputType = any> {
   output: OutputType;
+  thought?: string;
   executor: Executor;
 }
 
@@ -1042,8 +1043,9 @@ export class PageTaskExecutor {
     const { tasks } = await this.convertPlanToExecutable(plans, opts);
     await taskExecutor.append(tasks);
     const result = await taskExecutor.flush();
+    const { output } = result!;
     return {
-      output: result,
+      output,
       executor: taskExecutor,
     };
   }
@@ -1085,7 +1087,8 @@ export class PageTaskExecutor {
 
       // plan
       await taskExecutor.append(planningTask);
-      const planResult: PlanningAIResponse = await taskExecutor.flush();
+      const result = await taskExecutor.flush();
+      const planResult: PlanningAIResponse = result?.output;
       if (taskExecutor.isInErrorState()) {
         return {
           output: planResult,
@@ -1167,13 +1170,19 @@ export class PageTaskExecutor {
       const planningTask: ExecutionTaskPlanningApply =
         this.planningTaskToGoal(userPrompt);
       await taskExecutor.append(planningTask);
-      const output = await taskExecutor.flush();
+      const result = await taskExecutor.flush();
       if (taskExecutor.isInErrorState()) {
         return {
           output: undefined,
           executor: taskExecutor,
         };
       }
+      if (!result) {
+        throw new Error(
+          'result of taskExecutor.flush() is undefined in function actionToGoal',
+        );
+      }
+      const { output } = result;
       const plans = output.actions;
       yamlFlow.push(...(output.yamlFlow || []));
       let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;
@@ -1211,7 +1220,7 @@ export class PageTaskExecutor {
   }
 
   private async createTypeQueryTask<T>(
-    type: 'Query' | 'Boolean' | 'Number' | 'String',
+    type: 'Query' | 'Boolean' | 'Number' | 'String' | 'Assert',
     demand: InsightExtractParam,
     opt?: InsightExtractOption,
     multimodalPrompt?: TMultimodalPrompt,
@@ -1254,7 +1263,7 @@ export class PageTaskExecutor {
           };
         }
 
-        const { data, usage } = await this.insight.extract<any>(
+        const { data, usage, thought } = await this.insight.extract<any>(
           demandInput,
           opt,
           multimodalPrompt,
@@ -1270,14 +1279,25 @@ export class PageTaskExecutor {
           output: outputResult,
           log: { dump: insightDump },
           usage,
+          thought,
         };
       },
     };
 
     await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
-    const output = await taskExecutor.flush();
+    const result = await taskExecutor.flush();
+
+    if (!result) {
+      throw new Error(
+        'result of taskExecutor.flush() is undefined in function createTypeQueryTask',
+      );
+    }
+
+    const { output, thought } = result;
+
     return {
       output,
+      thought,
       executor: taskExecutor,
     };
   }
@@ -1330,27 +1350,15 @@ export class PageTaskExecutor {
 
   async assert(
     assertion: TUserPrompt,
-  ): Promise<ExecutionResult<InsightAssertionResponse>> {
-    const description = `assert: ${typeof assertion === 'string' ? assertion : assertion.prompt}`;
-    const taskExecutor = new Executor(taskTitleStr('Assert', description), {
-      onTaskStart: this.onTaskStartCallback,
-    });
-    const assertionPlan: PlanningAction<PlanningActionParamAssert> = {
-      type: 'Assert',
-      param: {
-        assertion,
-      },
-      locate: null,
-    };
-    const { tasks } = await this.convertPlanToExecutable([assertionPlan]);
-
-    await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
-    const output: InsightAssertionResponse = await taskExecutor.flush();
-
-    return {
-      output,
-      executor: taskExecutor,
-    };
+    opt?: InsightExtractOption,
+  ): Promise<ExecutionResult<boolean>> {
+    const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
+    return await this.createTypeQueryTask<boolean>(
+      'Assert',
+      textPrompt,
+      opt,
+      multimodalPrompt,
+    );
   }
 
   /**
@@ -1436,7 +1444,15 @@ export class PageTaskExecutor {
       await taskExecutor.append(
         this.prependExecutorWithScreenshot(assertTasks[0]),
       );
-      const output: InsightAssertionResponse = await taskExecutor.flush();
+      const result = await taskExecutor.flush();
+
+      if (!result) {
+        throw new Error(
+          'result of taskExecutor.flush() is undefined in function waitFor',
+        );
+      }
+
+      const { output } = result as { output: InsightAssertionResponse };
 
       if (output?.pass) {
         return {

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ export class Executor {`
`63`	`63`	`}`
`64`	`64`	`}`
`65`	`65`
`66`		`- async flush(): Promise<any> {`
	`66`	`+ async flush(): Promise<{ output: any; thought?: string } \| undefined> {`
`67`	`67`	`if (this.status === 'init' && this.tasks.length > 0) {`
`68`	`68`	`console.warn(`
`69`	`69`	`'illegal state for executor, status is init but tasks are not empty',`
`@@ -178,7 +178,11 @@ export class Executor {`
`178`	`178`	`if (this.tasks.length) {`
`179`	`179`	`// return the last output`
`180`	`180`	`const outputIndex = Math.min(taskIndex, this.tasks.length - 1);`
`181`		`- return this.tasks[outputIndex].output;`
	`181`	`+ const { thought, output } = this.tasks[outputIndex];`
	`182`	`+ return {`
	`183`	`+ thought,`
	`184`	`+ output,`
	`185`	`+ };`
`182`	`186`	`}`
`183`	`187`	`}`
`184`	`188`
Original file line number	Diff line number	Diff line change
`@@ -11,8 +11,11 @@ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Ar`
`11`	`11`
`12`	`12`	`If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.`
`13`	`13`
	`14`	`+If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.`
	`15`	`+`
`14`	`16`	`Return in the following JSON format:`
`15`	`17`	`{`
	`18`	`+ thought: string, // the thought process of the extraction, less then 100 words, not required by default.`
`16`	`19`	`data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.`
`17`	`20`	`errors: [], // string[], error message if any`
`18`	`21`	`}`
Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ export type AIElementResponse =`
`82`	`82`	`export interface AIDataExtractionResponse<DataDemand> {`
`83`	`83`	`data: DataDemand;`
`84`	`84`	`errors?: string[];`
	`85`	`+ thought?: string;`
`85`	`86`	`}`
`86`	`87`
`87`	`88`	`export interface AISectionLocatorResponse {`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ export interface LocateOption {`
`9`	`9`	`export interface InsightExtractOption {`
`10`	`10`	`domIncluded?: boolean \| 'visible-only';`
`11`	`11`	`screenshotIncluded?: boolean;`
	`12`	`+ returnThought?: boolean;`
`12`	`13`	`}`
`13`	`14`
`14`	`15`	`export interface ReferenceImage {`
Original file line number	Diff line number	Diff line change
`@@ -36,8 +36,11 @@ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Ar`
`36`	`36`
`37`	`37`	`If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.`
`38`	`38`
	`39`	`+If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.`
	`40`	`+`
`39`	`41`	`Return in the following JSON format:`
`40`	`42`	`{`
	`43`	`+ thought: string, // the thought process of the extraction, less then 100 words, not required by default.`
`41`	`44`	`data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.`
`42`	`45`	`errors: [], // string[], error message if any`
`43`	`46`	`}`