Skip to content

Commit 73d8b23

Browse files
authored
feat(core): enable thinking for vqa (#1249)
* feat(core): enable thinking for vqa
* fix(core): ci
* feat(core): show thought in report
1 parent b55ea2c commit 73d8b23

File tree

8 files changed

+39
-31
lines changed

8 files changed

+39
-31
lines changed

apps/report/src/components/detail-side/index.tsx

Lines changed: 27 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -500,20 +500,35 @@ const DetailSide = (): JSX.Element => {
500500
} else if (dump?.data !== undefined) {
501501
data = dump.data;
502502
}
503+
504+
const thought = task?.thought;
505+
503506
if (data !== undefined) {
504507
outputDataContent = (
505-
<Card
506-
liteMode={true}
507-
onMouseEnter={noop}
508-
onMouseLeave={noop}
509-
content={
510-
<pre>
511-
{typeof data === 'object'
512-
? JSON.stringify(data, undefined, 2)
513-
: String(data)}
514-
</pre>
515-
}
516-
/>
508+
<>
509+
{thought && (
510+
<Card
511+
liteMode={true}
512+
onMouseEnter={noop}
513+
onMouseLeave={noop}
514+
content={<pre>{thought}</pre>}
515+
title="thought"
516+
/>
517+
)}
518+
<Card
519+
liteMode={true}
520+
onMouseEnter={noop}
521+
onMouseLeave={noop}
522+
title="output"
523+
content={
524+
<pre>
525+
{typeof data === 'object'
526+
? JSON.stringify(data, undefined, 2)
527+
: String(data)}
528+
</pre>
529+
}
530+
/>
531+
</>
517532
);
518533
}
519534
}

packages/core/src/agent/agent.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -838,7 +838,6 @@ export class Agent<
838838
screenshotIncluded:
839839
opt?.screenshotIncluded ??
840840
defaultInsightExtractOption.screenshotIncluded,
841-
returnThought: opt?.returnThought ?? true,
842841
isWaitForAssert: opt?.isWaitForAssert,
843842
doNotThrowError: opt?.doNotThrowError,
844843
};

packages/core/src/agent/tasks.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1107,7 +1107,6 @@ export class TaskExecutor {
11071107
modelConfig,
11081108
{
11091109
isWaitForAssert: true,
1110-
returnThought: true,
11111110
doNotThrowError: true,
11121111
},
11131112
multimodalPrompt,

packages/core/src/ai-model/inspect.ts

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -463,13 +463,6 @@ export async function AiExtractElementInfo<
463463
},
464464
];
465465

466-
if (options.extractOption?.returnThought) {
467-
msgs.push({
468-
role: 'user',
469-
content: 'Please provide reasons.',
470-
});
471-
}
472-
473466
if (multimodalPrompt) {
474467
const addOns = await promptsToChatParam({
475468
images: multimodalPrompt.images,

packages/core/src/ai-model/prompt/extraction.ts

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,17 +5,16 @@ export function systemPromptToExtract() {
55
return `
66
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
77
8-
The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
8+
The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
99
1010
If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1111
1212
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
1313
14-
If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
1514
1615
Return in the following JSON format:
1716
{
18-
thought: string, // the thought process of the extraction, less then 100 words, not required by default.
17+
thought: string, // the thinking process of the extraction, less then 300 words
1918
data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
2019
errors: [], // string[], error message if any
2120
}
@@ -34,6 +33,7 @@ For example, if the DATA_DEMAND is:
3433
By viewing the screenshot and page contents, you can extract the following data:
3534
3635
{
36+
thought: "According to the screenshot, i can see ...",
3737
data: {
3838
name: "John",
3939
age: 30,
@@ -51,6 +51,7 @@ the todo items list, string[]
5151
By viewing the screenshot and page contents, you can extract the following data:
5252
5353
{
54+
thought: "According to the screenshot, i can see ...",
5455
data: ["todo 1", "todo 2", "todo 3"],
5556
}
5657
@@ -64,6 +65,7 @@ the page title, string
6465
By viewing the screenshot and page contents, you can extract the following data:
6566
6667
{
68+
thought: "According to the screenshot, i can see ...",
6769
data: "todo list",
6870
}
6971
@@ -79,6 +81,7 @@ If the DATA_DEMAND is:
7981
By viewing the screenshot and page contents, you can extract the following data:
8082
8183
{
84+
thought: "According to the screenshot, i can see ...",
8285
data: { result: true },
8386
}
8487
`;

packages/core/src/yaml.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,6 @@ export interface LocateOption {
1313
export interface InsightExtractOption {
1414
domIncluded?: boolean | 'visible-only';
1515
screenshotIncluded?: boolean;
16-
returnThought?: boolean;
1716
// To make the assert in the "waitfor" section display the warning icon in report
1817
isWaitForAssert?: boolean;
1918
doNotThrowError?: boolean;

packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,17 +30,16 @@ exports[`extract element > systemPromptToExtract 1`] = `
3030
"
3131
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
3232
33-
The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
33+
The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
3434
3535
If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
3636
3737
If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
3838
39-
If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
4039
4140
Return in the following JSON format:
4241
{
43-
thought: string, // the thought process of the extraction, less then 100 words, not required by default.
42+
thought: string, // the thinking process of the extraction, less then 300 words
4443
data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
4544
errors: [], // string[], error message if any
4645
}
@@ -59,6 +58,7 @@ For example, if the DATA_DEMAND is:
5958
By viewing the screenshot and page contents, you can extract the following data:
6059
6160
{
61+
thought: "According to the screenshot, i can see ...",
6262
data: {
6363
name: "John",
6464
age: 30,
@@ -76,6 +76,7 @@ the todo items list, string[]
7676
By viewing the screenshot and page contents, you can extract the following data:
7777
7878
{
79+
thought: "According to the screenshot, i can see ...",
7980
data: ["todo 1", "todo 2", "todo 3"],
8081
}
8182
@@ -89,6 +90,7 @@ the page title, string
8990
By viewing the screenshot and page contents, you can extract the following data:
9091
9192
{
93+
thought: "According to the screenshot, i can see ...",
9294
data: "todo list",
9395
}
9496
@@ -104,6 +106,7 @@ If the DATA_DEMAND is:
104106
By viewing the screenshot and page contents, you can extract the following data:
105107
106108
{
109+
thought: "According to the screenshot, i can see ...",
107110
data: { result: true },
108111
}
109112
"

packages/web-integration/tests/unit-test/page-task-executor-waitFor.test.ts

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
127127
mockedModelConfig,
128128
{
129129
isWaitForAssert: true,
130-
returnThought: true,
131130
doNotThrowError: true,
132131
},
133132
undefined,
@@ -189,7 +188,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
189188
mockedModelConfig,
190189
{
191190
isWaitForAssert: true,
192-
returnThought: true,
193191
doNotThrowError: true,
194192
},
195193
undefined,
@@ -259,7 +257,6 @@ describe('TaskExecutor waitFor method with doNotThrowError', () => {
259257
mockedModelConfig,
260258
{
261259
isWaitForAssert: true,
262-
returnThought: true,
263260
doNotThrowError: true,
264261
},
265262
undefined,

0 commit comments

Comments
 (0)