Skip to content

Commit f27bf3a

Browse files
committed
[model] Support Qwen3-VL 2B/32B (#6259)
1 parent a7432ad commit f27bf3a

File tree

3 files changed

+24
-0
lines changed

3 files changed

+24
-0
lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -709,6 +709,10 @@
709709
|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
710710
|[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
711711
|[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
712+
|[Qwen/Qwen3-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)|
713+
|[Qwen/Qwen3-VL-2B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking)|
714+
|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8)|
715+
|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking-FP8)|
712716
|[Qwen/Qwen3-VL-4B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)|
713717
|[Qwen/Qwen3-VL-4B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking)|
714718
|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-FP8)|
@@ -717,6 +721,10 @@
717721
|[Qwen/Qwen3-VL-8B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking)|
718722
|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8)|
719723
|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking-FP8)|
724+
|[Qwen/Qwen3-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct)|
725+
|[Qwen/Qwen3-VL-32B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking)|
726+
|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct-FP8)|
727+
|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-FP8)|
720728
|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
721729
|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
722730
|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -709,6 +709,10 @@ The table below introduces the models integrated with ms-swift:
709709
|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
710710
|[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
711711
|[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
712+
|[Qwen/Qwen3-VL-2B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)|
713+
|[Qwen/Qwen3-VL-2B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-2B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking)|
714+
|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct-FP8)|
715+
|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-2B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-2B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-2B-Thinking-FP8)|
712716
|[Qwen/Qwen3-VL-4B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct)|
713717
|[Qwen/Qwen3-VL-4B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-4B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-4B-Thinking)|
714718
|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-4B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-4B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-FP8)|
@@ -717,6 +721,10 @@ The table below introduces the models integrated with ms-swift:
717721
|[Qwen/Qwen3-VL-8B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-8B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking)|
718722
|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct-FP8)|
719723
|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-8B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-8B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-8B-Thinking-FP8)|
724+
|[Qwen/Qwen3-VL-32B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct)|
725+
|[Qwen/Qwen3-VL-32B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-32B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking)|
726+
|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Instruct-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct-FP8)|
727+
|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-32B-Thinking-FP8)|qwen3_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-32B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-FP8)|
720728
|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
721729
|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
722730
|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|

swift/llm/model/model/qwen.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1068,6 +1068,10 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
10681068
ModelMeta(
10691069
MLLMModelType.qwen3_vl, [
10701070
ModelGroup([
1071+
Model('Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-VL-2B-Instruct'),
1072+
Model('Qwen/Qwen3-VL-2B-Thinking', 'Qwen/Qwen3-VL-2B-Thinking'),
1073+
Model('Qwen/Qwen3-VL-2B-Instruct-FP8', 'Qwen/Qwen3-VL-2B-Instruct-FP8'),
1074+
Model('Qwen/Qwen3-VL-2B-Thinking-FP8', 'Qwen/Qwen3-VL-2B-Thinking-FP8'),
10711075
Model('Qwen/Qwen3-VL-4B-Instruct', 'Qwen/Qwen3-VL-4B-Instruct'),
10721076
Model('Qwen/Qwen3-VL-4B-Thinking', 'Qwen/Qwen3-VL-4B-Thinking'),
10731077
Model('Qwen/Qwen3-VL-4B-Instruct-FP8', 'Qwen/Qwen3-VL-4B-Instruct-FP8'),
@@ -1076,6 +1080,10 @@ def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
10761080
Model('Qwen/Qwen3-VL-8B-Thinking', 'Qwen/Qwen3-VL-8B-Thinking'),
10771081
Model('Qwen/Qwen3-VL-8B-Instruct-FP8', 'Qwen/Qwen3-VL-8B-Instruct-FP8'),
10781082
Model('Qwen/Qwen3-VL-8B-Thinking-FP8', 'Qwen/Qwen3-VL-8B-Thinking-FP8'),
1083+
Model('Qwen/Qwen3-VL-32B-Instruct', 'Qwen/Qwen3-VL-32B-Instruct'),
1084+
Model('Qwen/Qwen3-VL-32B-Thinking', 'Qwen/Qwen3-VL-32B-Thinking'),
1085+
Model('Qwen/Qwen3-VL-32B-Instruct-FP8', 'Qwen/Qwen3-VL-32B-Instruct-FP8'),
1086+
Model('Qwen/Qwen3-VL-32B-Thinking-FP8', 'Qwen/Qwen3-VL-32B-Thinking-FP8'),
10791087
]),
10801088
],
10811089
TemplateType.qwen3_vl,

0 commit comments

Comments
 (0)