Skip to content

Commit 78606b3

Browse files
authored
feat(core): adapt qwen3-vl (#1245)
* feat(core): adapt qwen3-vl
* fix(core): ci
1 parent 2b3e491 commit 78606b3

File tree

5 files changed

+30
-34
lines changed

5 files changed

+30
-34
lines changed

packages/core/src/ai-model/inspect.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,10 +178,10 @@ export async function AiLocateElement<
178178
imageHeight = paddedResult.height;
179179
imagePayload = paddedResult.imageBase64;
180180
} else if (vlMode === 'qwen3-vl') {
181-
const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
182-
imageWidth = paddedResult.width;
183-
imageHeight = paddedResult.height;
184-
imagePayload = paddedResult.imageBase64;
181+
// const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
182+
// imageWidth = paddedResult.width;
183+
// imageHeight = paddedResult.height;
184+
// imagePayload = paddedResult.imageBase64;
185185
} else if (!vlMode) {
186186
imagePayload = await markupImageForLLM(
187187
screenshotBase64,

packages/core/src/ai-model/llm-planning.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,10 @@ export async function plan(
6363
imageHeight = paddedResult.height;
6464
imagePayload = paddedResult.imageBase64;
6565
} else if (vlMode === 'qwen3-vl') {
66-
const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
67-
imageWidth = paddedResult.width;
68-
imageHeight = paddedResult.height;
69-
imagePayload = paddedResult.imageBase64;
66+
// const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
67+
// imageWidth = paddedResult.width;
68+
// imageHeight = paddedResult.height;
69+
// imagePayload = paddedResult.imageBase64;
7070
} else if (!vlMode) {
7171
imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
7272
width: imageWidth,

packages/core/src/ai-model/prompt/llm-planning.ts

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
import assert from 'node:assert';
21
import type { DeviceAction } from '@/types';
3-
import { PromptTemplate } from '@langchain/core/prompts';
42
import type { TVlModeTypes } from '@midscene/shared/env';
53
import type { ResponseFormatJSONSchema } from 'openai/resources/index';
64
import type { ZodObject, z } from 'zod';
75
import { ifMidsceneLocatorField } from '../common';
86
import { bboxDescription } from './common';
97

108
// Note: put the log field first to trigger the CoT
11-
const vlCurrentLog = `"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.`;
9+
10+
const vlCurrentLog = `"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.`;
1211
const llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
1312

1413
const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
@@ -215,7 +214,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
215214
this and output the JSON:
216215
217216
{
218-
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
217+
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
219218
"action": {
220219
"type": "Tap",
221220
"param": {
@@ -312,7 +311,7 @@ By viewing the page screenshot and description, you should consider this and out
312311
* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
313312
* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
314313
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
315-
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
314+
* Compose the log: The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.
316315
* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
317316
318317
{
@@ -332,7 +331,7 @@ By viewing the page screenshot and description, you should consider this and out
332331
],
333332
"error": null,
334333
"more_actions_needed_by_instruction": true,
335-
"log": "Click the language switch button to open the language options. Wait for 1 second",
334+
"log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
336335
}
337336
338337
### Example: What NOT to do
@@ -355,7 +354,7 @@ Wrong output:
355354
}
356355
],
357356
"more_actions_needed_by_instruction": false, // WRONG: should be true
358-
"log": "Click the language switch button to open the language options",
357+
"log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
359358
}
360359
`;
361360

packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ By viewing the page screenshot and description, you should consider this and out
554554
* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
555555
* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
556556
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
557-
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
557+
* Compose the log: The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.
558558
* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
559559
560560
{
@@ -574,7 +574,7 @@ By viewing the page screenshot and description, you should consider this and out
574574
],
575575
"error": null,
576576
"more_actions_needed_by_instruction": true,
577-
"log": "Click the language switch button to open the language options. Wait for 1 second",
577+
"log": "The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
578578
}
579579
580580
### Example: What NOT to do
@@ -597,7 +597,7 @@ Wrong output:
597597
}
598598
],
599599
"more_actions_needed_by_instruction": false, // WRONG: should be true
600-
"log": "Click the language switch button to open the language options",
600+
"log": "The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
601601
}
602602
"
603603
`;
@@ -643,7 +643,7 @@ Field description:
643643
644644
Return in JSON format:
645645
{
646-
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
646+
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
647647
"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
648648
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
649649
"action":
@@ -659,7 +659,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
659659
this and output the JSON:
660660
661661
{
662-
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
662+
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
663663
"action": {
664664
"type": "Tap",
665665
"param": {
@@ -715,7 +715,7 @@ Field description:
715715
716716
Return in JSON format:
717717
{
718-
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
718+
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
719719
"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
720720
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
721721
"action":
@@ -731,7 +731,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
731731
this and output the JSON:
732732
733733
{
734-
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
734+
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
735735
"action": {
736736
"type": "Tap",
737737
"param": {
@@ -787,7 +787,7 @@ Field description:
787787
788788
Return in JSON format:
789789
{
790-
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
790+
"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
791791
"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
792792
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
793793
"action":
@@ -803,7 +803,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
803803
this and output the JSON:
804804
805805
{
806-
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
806+
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
807807
"action": {
808808
"type": "Tap",
809809
"param": {

packages/evaluation/tests/llm-locator.test.ts

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
import { writeFileSync } from 'node:fs';
22
import Insight, { type Rect } from '@midscene/core';
33
import { sleep } from '@midscene/core/utils';
4-
import {
5-
globalConfigManager,
6-
globalModelConfigManager,
7-
} from '@midscene/shared/env';
4+
import { globalModelConfigManager } from '@midscene/shared/env';
85
import { saveBase64Image } from '@midscene/shared/img';
96

107
import dotenv from 'dotenv';
@@ -19,12 +16,12 @@ dotenv.config({
1916

2017
const testSources = [
2118
'antd-carousel',
22-
// 'todo',
23-
// 'online_order',
24-
// 'online_order_list',
25-
// 'taobao',
26-
// 'aweme-login',
27-
// 'aweme-play',
19+
'todo',
20+
'online_order',
21+
'online_order_list',
22+
'taobao',
23+
'aweme-login',
24+
'aweme-play',
2825
];
2926

3027
let resultCollector: TestResultCollector;

0 commit comments

Comments
 (0)