Skip to content

Commit 48d8587

Browse files
authored
feat: add yolo mode support to auto vision model switch (QwenLM#652)
* feat: add yolo mode support to auto vision model switch * feat: add cli args & env variables for switch behavior * fix: use dedicated model names and settings * docs: add vision model instructions * fix: failed test case * fix: setModel failure
1 parent 5ecb4a2 commit 48d8587

26 files changed

+1133
-122
lines changed

README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ For detailed setup instructions, see [Authorization](#authorization).
5454
- **Code Understanding & Editing** - Query and edit large codebases beyond traditional context window limits
5555
- **Workflow Automation** - Automate operational tasks like handling pull requests and complex rebases
5656
- **Enhanced Parser** - Adapted parser specifically optimized for Qwen-Coder models
57+
- **Vision Model Support** - Automatically detect images in your input and seamlessly switch to vision-capable models for multimodal analysis
5758

5859
## Installation
5960

@@ -121,6 +122,58 @@ Create or edit `.qwen/settings.json` in your home directory:
121122

122123
> 📝 **Note**: Session token limit applies to a single conversation, not cumulative API calls.
123124
125+
### Vision Model Configuration
126+
127+
Qwen Code includes intelligent vision model auto-switching that detects images in your input and can automatically switch to vision-capable models for multimodal analysis. **This feature is enabled by default** - when you include images in your queries, you'll see a dialog asking how you'd like to handle the vision model switch.
128+
129+
#### Skip the Switch Dialog (Optional)
130+
131+
If you don't want to see the interactive dialog each time, configure the default behavior in your `.qwen/settings.json`:
132+
133+
```json
134+
{
135+
"experimental": {
136+
"vlmSwitchMode": "once"
137+
}
138+
}
139+
```
140+
141+
**Available modes:**
142+
143+
- **`"once"`** - Switch to vision model for this query only, then revert
144+
- **`"session"`** - Switch to vision model for the entire session
145+
- **`"persist"`** - Continue with current model (no switching)
146+
- **Not set** - Show interactive dialog each time (default)
147+
148+
#### Command Line Override
149+
150+
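You can also set the behavior via the command line (when the flag is omitted, it falls back to the `VLM_SWITCH_MODE` environment variable if that is set):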
151+
152+
```bash
153+
# Switch once per query
154+
qwen --vlm-switch-mode once
155+
156+
# Switch for entire session
157+
qwen --vlm-switch-mode session
158+
159+
# Never switch automatically
160+
qwen --vlm-switch-mode persist
161+
```
162+
163+
#### Disable Vision Models (Optional)
164+
165+
To completely disable vision model support, add the following to your `.qwen/settings.json`:
166+
167+
```json
168+
{
169+
"experimental": {
170+
"visionModelPreview": false
171+
}
172+
}
173+
```
174+
175+
> 💡 **Tip**: In YOLO mode (`--yolo`), vision switching happens automatically without prompts when images are detected.
176+
124177
### Authorization
125178

126179
Choose your preferred authentication method based on your needs:

packages/cli/src/config/config.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1514,7 +1514,7 @@ describe('loadCliConfig model selection', () => {
15141514
argv,
15151515
);
15161516

1517-
expect(config.getModel()).toBe('qwen3-coder-plus');
1517+
expect(config.getModel()).toBe('coder-model');
15181518
});
15191519

15201520
it('always prefers model from argvs', async () => {

packages/cli/src/config/config.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ export interface CliArgs {
8282
includeDirectories: string[] | undefined;
8383
tavilyApiKey: string | undefined;
8484
screenReader: boolean | undefined;
85+
vlmSwitchMode: string | undefined;
8586
}
8687

8788
export async function parseArguments(settings: Settings): Promise<CliArgs> {
@@ -249,6 +250,13 @@ export async function parseArguments(settings: Settings): Promise<CliArgs> {
249250
description: 'Enable screen reader mode for accessibility.',
250251
default: false,
251252
})
253+
.option('vlm-switch-mode', {
254+
type: 'string',
255+
choices: ['once', 'session', 'persist'],
256+
description:
257+
'Default behavior when images are detected in input. Values: once (one-time switch), session (switch for entire session), persist (continue with current model). Overrides settings files.',
258+
default: process.env['VLM_SWITCH_MODE'],
259+
})
252260
.check((argv) => {
253261
if (argv.prompt && argv['promptInteractive']) {
254262
throw new Error(
@@ -524,6 +532,9 @@ export async function loadCliConfig(
524532
argv.screenReader !== undefined
525533
? argv.screenReader
526534
: (settings.ui?.accessibility?.screenReader ?? false);
535+
536+
const vlmSwitchMode =
537+
argv.vlmSwitchMode || settings.experimental?.vlmSwitchMode;
527538
return new Config({
528539
sessionId,
529540
embeddingModel: DEFAULT_GEMINI_EMBEDDING_MODEL,
@@ -630,6 +641,7 @@ export async function loadCliConfig(
630641
skipNextSpeakerCheck: settings.model?.skipNextSpeakerCheck,
631642
enablePromptCompletion: settings.general?.enablePromptCompletion ?? false,
632643
skipLoopDetection: settings.skipLoopDetection ?? false,
644+
vlmSwitchMode,
633645
});
634646
}
635647

packages/cli/src/config/settings.test.ts

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ const MOCK_WORKSPACE_SETTINGS_PATH = pathActual.join(
6969
);
7070

7171
// A more flexible type for test data that allows arbitrary properties.
72-
type TestSettings = Settings & { [key: string]: unknown };
72+
type TestSettings = Settings & {
73+
[key: string]: unknown;
74+
nested?: { [key: string]: unknown };
75+
nestedObj?: { [key: string]: unknown };
76+
};
7377

7478
vi.mock('fs', async (importOriginal) => {
7579
// Get all the functions from the real 'fs' module
@@ -137,6 +141,9 @@ describe('Settings Loading and Merging', () => {
137141
advanced: {
138142
excludedEnvVars: [],
139143
},
144+
experimental: {},
145+
contentGenerator: {},
146+
systemPromptMappings: {},
140147
extensions: {
141148
disabled: [],
142149
workspacesWithMigrationNudge: [],
@@ -197,6 +204,9 @@ describe('Settings Loading and Merging', () => {
197204
advanced: {
198205
excludedEnvVars: [],
199206
},
207+
experimental: {},
208+
contentGenerator: {},
209+
systemPromptMappings: {},
200210
extensions: {
201211
disabled: [],
202212
workspacesWithMigrationNudge: [],
@@ -260,6 +270,9 @@ describe('Settings Loading and Merging', () => {
260270
advanced: {
261271
excludedEnvVars: [],
262272
},
273+
experimental: {},
274+
contentGenerator: {},
275+
systemPromptMappings: {},
263276
extensions: {
264277
disabled: [],
265278
workspacesWithMigrationNudge: [],
@@ -320,6 +333,9 @@ describe('Settings Loading and Merging', () => {
320333
advanced: {
321334
excludedEnvVars: [],
322335
},
336+
experimental: {},
337+
contentGenerator: {},
338+
systemPromptMappings: {},
323339
extensions: {
324340
disabled: [],
325341
workspacesWithMigrationNudge: [],
@@ -385,6 +401,9 @@ describe('Settings Loading and Merging', () => {
385401
advanced: {
386402
excludedEnvVars: [],
387403
},
404+
experimental: {},
405+
contentGenerator: {},
406+
systemPromptMappings: {},
388407
extensions: {
389408
disabled: [],
390409
workspacesWithMigrationNudge: [],
@@ -477,6 +496,9 @@ describe('Settings Loading and Merging', () => {
477496
advanced: {
478497
excludedEnvVars: [],
479498
},
499+
experimental: {},
500+
contentGenerator: {},
501+
systemPromptMappings: {},
480502
extensions: {
481503
disabled: [],
482504
workspacesWithMigrationNudge: [],
@@ -562,6 +584,9 @@ describe('Settings Loading and Merging', () => {
562584
advanced: {
563585
excludedEnvVars: [],
564586
},
587+
experimental: {},
588+
contentGenerator: {},
589+
systemPromptMappings: {},
565590
extensions: {
566591
disabled: [],
567592
workspacesWithMigrationNudge: [],
@@ -691,6 +716,9 @@ describe('Settings Loading and Merging', () => {
691716
'/system/dir',
692717
],
693718
},
719+
experimental: {},
720+
contentGenerator: {},
721+
systemPromptMappings: {},
694722
extensions: {
695723
disabled: [],
696724
workspacesWithMigrationNudge: [],
@@ -1431,6 +1459,9 @@ describe('Settings Loading and Merging', () => {
14311459
advanced: {
14321460
excludedEnvVars: [],
14331461
},
1462+
experimental: {},
1463+
contentGenerator: {},
1464+
systemPromptMappings: {},
14341465
extensions: {
14351466
disabled: [],
14361467
workspacesWithMigrationNudge: [],
@@ -1516,7 +1547,11 @@ describe('Settings Loading and Merging', () => {
15161547
'workspace_endpoint_from_env/api',
15171548
);
15181549
expect(
1519-
(settings.workspace.settings as TestSettings)['nested']['value'],
1550+
(
1551+
(settings.workspace.settings as TestSettings).nested as {
1552+
[key: string]: unknown;
1553+
}
1554+
)['value'],
15201555
).toBe('workspace_endpoint_from_env');
15211556
expect((settings.merged as TestSettings)['endpoint']).toBe(
15221557
'workspace_endpoint_from_env/api',
@@ -1766,19 +1801,39 @@ describe('Settings Loading and Merging', () => {
17661801
).toBeUndefined();
17671802

17681803
expect(
1769-
(settings.user.settings as TestSettings)['nestedObj']['nestedNull'],
1804+
(
1805+
(settings.user.settings as TestSettings).nestedObj as {
1806+
[key: string]: unknown;
1807+
}
1808+
)['nestedNull'],
17701809
).toBeNull();
17711810
expect(
1772-
(settings.user.settings as TestSettings)['nestedObj']['nestedBool'],
1811+
(
1812+
(settings.user.settings as TestSettings).nestedObj as {
1813+
[key: string]: unknown;
1814+
}
1815+
)['nestedBool'],
17731816
).toBe(true);
17741817
expect(
1775-
(settings.user.settings as TestSettings)['nestedObj']['nestedNum'],
1818+
(
1819+
(settings.user.settings as TestSettings).nestedObj as {
1820+
[key: string]: unknown;
1821+
}
1822+
)['nestedNum'],
17761823
).toBe(0);
17771824
expect(
1778-
(settings.user.settings as TestSettings)['nestedObj']['nestedString'],
1825+
(
1826+
(settings.user.settings as TestSettings).nestedObj as {
1827+
[key: string]: unknown;
1828+
}
1829+
)['nestedString'],
17791830
).toBe('literal');
17801831
expect(
1781-
(settings.user.settings as TestSettings)['nestedObj']['anotherEnv'],
1832+
(
1833+
(settings.user.settings as TestSettings).nestedObj as {
1834+
[key: string]: unknown;
1835+
}
1836+
)['anotherEnv'],
17821837
).toBe('env_string_nested_value');
17831838

17841839
delete process.env['MY_ENV_STRING'];
@@ -1864,6 +1919,9 @@ describe('Settings Loading and Merging', () => {
18641919
advanced: {
18651920
excludedEnvVars: [],
18661921
},
1922+
experimental: {},
1923+
contentGenerator: {},
1924+
systemPromptMappings: {},
18671925
extensions: {
18681926
disabled: [],
18691927
workspacesWithMigrationNudge: [],
@@ -2336,14 +2394,14 @@ describe('Settings Loading and Merging', () => {
23362394
vimMode: false,
23372395
},
23382396
model: {
2339-
maxSessionTurns: 0,
2397+
maxSessionTurns: -1,
23402398
},
23412399
context: {
23422400
includeDirectories: [],
23432401
},
23442402
security: {
23452403
folderTrust: {
2346-
enabled: null,
2404+
enabled: false,
23472405
},
23482406
},
23492407
};
@@ -2352,9 +2410,9 @@ describe('Settings Loading and Merging', () => {
23522410

23532411
expect(v1Settings).toEqual({
23542412
vimMode: false,
2355-
maxSessionTurns: 0,
2413+
maxSessionTurns: -1,
23562414
includeDirectories: [],
2357-
folderTrust: null,
2415+
folderTrust: false,
23582416
});
23592417
});
23602418

packages/cli/src/config/settings.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,24 @@ function mergeSettings(
396396
]),
397397
],
398398
},
399+
experimental: {
400+
...(systemDefaults.experimental || {}),
401+
...(user.experimental || {}),
402+
...(safeWorkspaceWithoutFolderTrust.experimental || {}),
403+
...(system.experimental || {}),
404+
},
405+
contentGenerator: {
406+
...(systemDefaults.contentGenerator || {}),
407+
...(user.contentGenerator || {}),
408+
...(safeWorkspaceWithoutFolderTrust.contentGenerator || {}),
409+
...(system.contentGenerator || {}),
410+
},
411+
systemPromptMappings: {
412+
...(systemDefaults.systemPromptMappings || {}),
413+
...(user.systemPromptMappings || {}),
414+
...(safeWorkspaceWithoutFolderTrust.systemPromptMappings || {}),
415+
...(system.systemPromptMappings || {}),
416+
},
399417
extensions: {
400418
...(systemDefaults.extensions || {}),
401419
...(user.extensions || {}),

packages/cli/src/config/settingsSchema.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -746,11 +746,21 @@ export const SETTINGS_SCHEMA = {
746746
label: 'Vision Model Preview',
747747
category: 'Experimental',
748748
requiresRestart: false,
749-
default: false,
749+
default: true,
750750
description:
751751
'Enable vision model support and auto-switching functionality. When disabled, vision models like qwen-vl-max-latest will be hidden and auto-switching will not occur.',
752752
showInDialog: true,
753753
},
754+
vlmSwitchMode: {
755+
type: 'string',
756+
label: 'VLM Switch Mode',
757+
category: 'Experimental',
758+
requiresRestart: false,
759+
default: undefined as string | undefined,
760+
description:
761+
'Default behavior when images are detected in input. Values: once (one-time switch), session (switch for entire session), persist (continue with current model). If not set, user will be prompted each time. This is a temporary experimental feature.',
762+
showInDialog: false,
763+
},
754764
},
755765
},
756766

0 commit comments

Comments
 (0)