feat(core): adapt qwen3-vl (#1245)

yuyutaotao · web-flow · commit 78606b3cb24d · 2025-09-25T16:35:39.000+08:00
* feat(core): adapt qwen3-vl

* fix(core): ci
diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
@@ -178,10 +178,10 @@ export async function AiLocateElement<
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
-    const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    imageWidth = paddedResult.width;
-    imageHeight = paddedResult.height;
-    imagePayload = paddedResult.imageBase64;
+    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
+    // imageWidth = paddedResult.width;
+    // imageHeight = paddedResult.height;
+    // imagePayload = paddedResult.imageBase64;
   } else if (!vlMode) {
     imagePayload = await markupImageForLLM(
       screenshotBase64,
diff --git a/packages/core/src/ai-model/llm-planning.ts b/packages/core/src/ai-model/llm-planning.ts
@@ -63,10 +63,10 @@ export async function plan(
     imageHeight = paddedResult.height;
     imagePayload = paddedResult.imageBase64;
   } else if (vlMode === 'qwen3-vl') {
-    const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
-    imageWidth = paddedResult.width;
-    imageHeight = paddedResult.height;
-    imagePayload = paddedResult.imageBase64;
+    // const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
+    // imageWidth = paddedResult.width;
+    // imageHeight = paddedResult.height;
+    // imagePayload = paddedResult.imageBase64;
   } else if (!vlMode) {
     imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
       width: imageWidth,
diff --git a/packages/core/src/ai-model/prompt/llm-planning.ts b/packages/core/src/ai-model/prompt/llm-planning.ts
@@ -1,14 +1,13 @@
-import assert from 'node:assert';
 import type { DeviceAction } from '@/types';
-import { PromptTemplate } from '@langchain/core/prompts';
 import type { TVlModeTypes } from '@midscene/shared/env';
 import type { ResponseFormatJSONSchema } from 'openai/resources/index';
 import type { ZodObject, z } from 'zod';
 import { ifMidsceneLocatorField } from '../common';
 import { bboxDescription } from './common';
 
 // Note: put the log field first to trigger the CoT
-const vlCurrentLog = `"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.`;
+
+const vlCurrentLog = `"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.`;
 const llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
 
 const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
@@ -215,7 +214,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
 this and output the JSON:
 
 {
-  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
+  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
   "action": {
     "type": "Tap",
     "param": {
@@ -312,7 +311,7 @@ By viewing the page screenshot and description, you should consider this and out
 * The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
 * Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
 * The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
-* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
+* Compose the log: The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.
 * The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
 
 {
@@ -332,7 +331,7 @@ By viewing the page screenshot and description, you should consider this and out
   ],
   "error": null,
   "more_actions_needed_by_instruction": true,
-  "log": "Click the language switch button to open the language options. Wait for 1 second",
+  "log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
 }
 
 ### Example: What NOT to do
@@ -355,7 +354,7 @@ Wrong output:
     }
   ],
   "more_actions_needed_by_instruction": false, // WRONG: should be true
-  "log": "Click the language switch button to open the language options",
+  "log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
 }
 `;
 
diff --git a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap
@@ -554,7 +554,7 @@ By viewing the page screenshot and description, you should consider this and out
 * The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
 * Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
 * The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
-* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
+* Compose the log: The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.
 * The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
 
 {
@@ -574,7 +574,7 @@ By viewing the page screenshot and description, you should consider this and out
   ],
   "error": null,
   "more_actions_needed_by_instruction": true,
-  "log": "Click the language switch button to open the language options. Wait for 1 second",
+  "log": "The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
 }
 
 ### Example: What NOT to do
@@ -597,7 +597,7 @@ Wrong output:
     }
   ],
   "more_actions_needed_by_instruction": false, // WRONG: should be true
-  "log": "Click the language switch button to open the language options",
+  "log": "The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
 }
 "
 `;
@@ -643,7 +643,7 @@ Field description:
 
 Return in JSON format:
 {
-  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
+  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
   "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
   "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
   "action": 
@@ -659,7 +659,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
 this and output the JSON:
 
 {
-  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
+  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
   "action": {
     "type": "Tap",
     "param": {
@@ -715,7 +715,7 @@ Field description:
 
 Return in JSON format:
 {
-  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
+  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
   "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
   "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
   "action": 
@@ -731,7 +731,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
 this and output the JSON:
 
 {
-  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
+  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
   "action": {
     "type": "Tap",
     "param": {
@@ -787,7 +787,7 @@ Field description:
 
 Return in JSON format:
 {
-  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, now i should use action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
+  "log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....". If no action should be done, log the reason. Use the same language as the user's instruction.
   "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
   "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
   "action": 
@@ -803,7 +803,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
 this and output the JSON:
 
 {
-  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, now i should use action 'Tap' to click 'Yes' in popup.",
+  "log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.",
   "action": {
     "type": "Tap",
     "param": {
diff --git a/packages/evaluation/tests/llm-locator.test.ts b/packages/evaluation/tests/llm-locator.test.ts
@@ -1,10 +1,7 @@
 import { writeFileSync } from 'node:fs';
 import Insight, { type Rect } from '@midscene/core';
 import { sleep } from '@midscene/core/utils';
-import {
-  globalConfigManager,
-  globalModelConfigManager,
-} from '@midscene/shared/env';
+import { globalModelConfigManager } from '@midscene/shared/env';
 import { saveBase64Image } from '@midscene/shared/img';
 
 import dotenv from 'dotenv';
@@ -19,12 +16,12 @@ dotenv.config({
 
 const testSources = [
   'antd-carousel',
-  // 'todo',
-  // 'online_order',
-  // 'online_order_list',
-  // 'taobao',
-  // 'aweme-login',
-  // 'aweme-play',
+  'todo',
+  'online_order',
+  'online_order_list',
+  'taobao',
+  'aweme-login',
+  'aweme-play',
 ];
 
 let resultCollector: TestResultCollector;