[model] Support Qwen3-VL 2B/32B (#6259)

Jintao-Huang · Jintao-Huang · commit f27bf3a40f84 · 2025-10-24T13:42:10.000+08:00
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -709,6 +709,10 @@
 |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
 |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|[Qwen/Qwen3-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)|
+|[Qwen/Qwen3-VL-2B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking)|
+|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking-FP8)|
 |[Qwen/Qwen3-VL-4B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)|
 |[Qwen/Qwen3-VL-4B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking)|
 |[Qwen/Qwen3-VL-4B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-FP8)|
@@ -717,6 +721,10 @@
 |[Qwen/Qwen3-VL-8B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking)|
 |[Qwen/Qwen3-VL-8B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8)|
 |[Qwen/Qwen3-VL-8B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking-FP8)|
+|[Qwen/Qwen3-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct)|
+|[Qwen/Qwen3-VL-32B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking)|
+|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-FP8)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
 |[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -709,6 +709,10 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
 |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|[Qwen/Qwen3-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)|
+|[Qwen/Qwen3-VL-2B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking)|
+|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking-FP8)|
 |[Qwen/Qwen3-VL-4B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)|
 |[Qwen/Qwen3-VL-4B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking)|
 |[Qwen/Qwen3-VL-4B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-FP8)|
@@ -717,6 +721,10 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen3-VL-8B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking)|
 |[Qwen/Qwen3-VL-8B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8)|
 |[Qwen/Qwen3-VL-8B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking-FP8)|
+|[Qwen/Qwen3-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct)|
+|[Qwen/Qwen3-VL-32B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking)|
+|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-FP8)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
 |[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
@@ -1068,6 +1068,10 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
     ModelMeta(
         MLLMModelType.qwen3_vl, [
             ModelGroup([
+                Model('Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-VL-2B-Instruct'),
+                Model('Qwen/Qwen3-VL-2B-Thinking', 'Qwen/Qwen3-VL-2B-Thinking'),
+                Model('Qwen/Qwen3-VL-2B-Instruct-FP8', 'Qwen/Qwen3-VL-2B-Instruct-FP8'),
+                Model('Qwen/Qwen3-VL-2B-Thinking-FP8', 'Qwen/Qwen3-VL-2B-Thinking-FP8'),
                 Model('Qwen/Qwen3-VL-4B-Instruct', 'Qwen/Qwen3-VL-4B-Instruct'),
                 Model('Qwen/Qwen3-VL-4B-Thinking', 'Qwen/Qwen3-VL-4B-Thinking'),
                 Model('Qwen/Qwen3-VL-4B-Instruct-FP8', 'Qwen/Qwen3-VL-4B-Instruct-FP8'),
@@ -1076,6 +1080,10 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
                 Model('Qwen/Qwen3-VL-8B-Thinking', 'Qwen/Qwen3-VL-8B-Thinking'),
                 Model('Qwen/Qwen3-VL-8B-Instruct-FP8', 'Qwen/Qwen3-VL-8B-Instruct-FP8'),
                 Model('Qwen/Qwen3-VL-8B-Thinking-FP8', 'Qwen/Qwen3-VL-8B-Thinking-FP8'),
+                Model('Qwen/Qwen3-VL-32B-Instruct', 'Qwen/Qwen3-VL-32B-Instruct'),
+                Model('Qwen/Qwen3-VL-32B-Thinking', 'Qwen/Qwen3-VL-32B-Thinking'),
+                Model('Qwen/Qwen3-VL-32B-Instruct-FP8', 'Qwen/Qwen3-VL-32B-Instruct-FP8'),
+                Model('Qwen/Qwen3-VL-32B-Thinking-FP8', 'Qwen/Qwen3-VL-32B-Thinking-FP8'),
             ]),
         ],
         TemplateType.qwen3_vl,