Skip to content

Commit b1dbace

Browse files
committed
[model] Support qwen3_next (transformers) (#5782)
1 parent e47b08d commit b1dbace

File tree

14 files changed: +45 -1 lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@
231231
|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
232232
|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
233233
|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|-|
234+
|[Qwen/Qwen3-Next-80B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct)|qwen3_next|qwen3_nothinking|transformers>=4.57.0.dev|✘|-|-|
235+
|[Qwen/Qwen3-Next-80B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Thinking)|qwen3_next_thinking|qwen3_thinking|transformers>=4.57.0.dev|✘|-|-|
234236
|[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
235237
|[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
236238
|[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,8 @@ The table below introduces the models integrated with ms-swift:
231231
|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✔|-|[Qwen/Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)|
232232
|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://modelscope.cn/models/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|[Qwen/Qwen3-235B-A22B-Thinking-2507-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507-FP8)|
233233
|[swift/Qwen3-235B-A22B-Thinking-2507-AWQ](https://modelscope.cn/models/swift/Qwen3-235B-A22B-Thinking-2507-AWQ)|qwen3_moe_thinking|qwen3_thinking|transformers>=4.51|✘|-|-|
234+
|[Qwen/Qwen3-Next-80B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct)|qwen3_next|qwen3_nothinking|transformers>=4.57.0.dev|✘|-|-|
235+
|[Qwen/Qwen3-Next-80B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Thinking)|qwen3_next_thinking|qwen3_thinking|transformers>=4.57.0.dev|✘|-|-|
234236
|[Qwen/Qwen3-Embedding-0.6B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)|
235237
|[Qwen/Qwen3-Embedding-4B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-4B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-4B](https://huggingface.co/Qwen/Qwen3-Embedding-4B)|
236238
|[Qwen/Qwen3-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-Embedding-8B)|qwen3_emb|qwen3_emb|-|✘|-|[Qwen/Qwen3-Embedding-8B](https://huggingface.co/Qwen/Qwen3-Embedding-8B)|

swift/llm/model/constant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ class LLMModelType:
1717
qwen3_nothinking = 'qwen3_nothinking'
1818
qwen3_moe = 'qwen3_moe'
1919
qwen3_moe_thinking = 'qwen3_moe_thinking'
20+
qwen3_next = 'qwen3_next'
21+
qwen3_next_thinking = 'qwen3_next_thinking'
2022
qwen3_emb = 'qwen3_emb'
2123
qwen3_reranker = 'qwen3_reranker'
2224

swift/llm/model/model/qwen.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,26 @@ def _get_cast_dtype(self) -> torch.dtype:
624624
requires=['transformers>=4.51'],
625625
))
626626

627+
register_model(
628+
ModelMeta(
629+
LLMModelType.qwen3_next,
630+
[ModelGroup([Model('Qwen/Qwen3-Next-80B-A3B-Instruct')])],
631+
TemplateType.qwen3_nothinking,
632+
get_model_tokenizer_with_flash_attn,
633+
architectures=['Qwen3NextForCausalLM'],
634+
requires=['transformers>=4.57.0.dev'],
635+
))
636+
637+
register_model(
638+
ModelMeta(
639+
LLMModelType.qwen3_next_thinking,
640+
[ModelGroup([Model('Qwen/Qwen3-Next-80B-A3B-Thinking')])],
641+
TemplateType.qwen3_thinking,
642+
get_model_tokenizer_with_flash_attn,
643+
architectures=['Qwen3NextForCausalLM'],
644+
requires=['transformers>=4.57.0.dev'],
645+
))
646+
627647

628648
def patch_qwen_vl_utils(vision_process):
629649
if hasattr(vision_process, '_patch'):

swift/megatron/model/gpt/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
from typing import Any, Dict
23

34
from ..config import convert_hf_config

swift/megatron/model/mm_gpt/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
from . import glm, internvl, qwen

swift/megatron/model/mm_gpt/glm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
from megatron.training import get_args
23

34
from swift.llm import ModelType, Template

swift/megatron/model/mm_gpt/internvl.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
import torch
23

34
from swift.llm import ModelType

swift/megatron/model/mm_gpt/qwen.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
import torch
23
from megatron.training import get_args, get_tokenizer
34
from PIL import Image

swift/megatron/model/mm_gpt/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# Copyright (c) Alibaba, Inc. and its affiliates.
12
from abc import ABC, abstractmethod
23
from contextlib import contextmanager
34
from dataclasses import dataclass

0 commit comments

Comments
 (0)