
Commit 0b0c819

Add Qwen3 2507 Thinking models (#2110)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent bf68369 commit 0b0c819

4 files changed: 83 additions, 6 deletions


README.md

Lines changed: 2 additions & 2 deletions

@@ -150,8 +150,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B, 235B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |

litgpt/config.py

Lines changed: 75 additions & 0 deletions

@@ -2859,6 +2859,81 @@ def norm_class(self) -> Type:
 ]
 configs.extend(qwen_3_moe)
 
+qwen_3_2507_thinking = [
+    # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json
+    dict(
+        name="Qwen3-235B-A22B-Thinking-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-Thinking-2507"),
+        block_size=262144,
+        head_size=128,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=94,
+        n_head=64,
+        n_embd=4096,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMoE",
+        intermediate_size=12288,
+        moe_intermediate_size=1536,
+        norm_eps=1e-6,
+        rope_base=5000000,
+        norm_qk=True,
+        n_expert=128,
+        n_expert_per_token=8,
+    ),
+    # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json
+    dict(
+        name="Qwen3-30B-A3B-Thinking-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Thinking-2507"),
+        block_size=262144,
+        head_size=128,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=48,
+        n_head=32,
+        n_embd=2048,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMoE",
+        intermediate_size=6144,
+        moe_intermediate_size=768,
+        norm_eps=1e-6,
+        rope_base=10000000,
+        norm_qk=True,
+        n_expert=128,
+        n_expert_per_token=8,
+    ),
+    # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json
+    dict(
+        name="Qwen3-4B-Thinking-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-4B-Thinking-2507"),
+        block_size=262144,
+        vocab_size=151643,
+        padded_vocab_size=151936,
+        n_layer=36,
+        n_head=32,
+        n_embd=2560,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=9728,
+        norm_eps=1e-6,
+        rope_base=5000000,
+        head_size=128,
+        norm_qk=True,
+    ),
+]
+configs.extend(qwen_3_2507_thinking)
 
 #############
 # Salamandra
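The three new entries mirror the upstream Hugging Face config.json files linked in the comments above. As a quick illustration (not part of this commit), the names become resolvable through `Config.from_name`, which looks entries up in the `configs` registry. A minimal sketch, assuming litgpt is installed from this branch; it checks config fields only, since instantiating the model would allocate billions of parameters:

```python
# Minimal sketch (not part of this commit): resolve a newly added config by
# name and verify a few fields against the values in the diff above.
from litgpt import Config

config = Config.from_name("Qwen3-4B-Thinking-2507")
assert config.n_layer == 36
assert config.n_embd == 2560
assert config.block_size == 262144  # 262,144-token context window
assert config.norm_qk is True  # Qwen3 applies RMSNorm to query/key projections

moe = Config.from_name("Qwen3-30B-A3B-Thinking-2507")
assert moe.n_expert == 128 and moe.n_expert_per_token == 8  # top-8 of 128 experts
```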

tests/test_model.py

Lines changed: 4 additions & 2 deletions

@@ -1082,7 +1082,9 @@ def test_against_original_qwen_2_5(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B"])
+@pytest.mark.parametrize(
+    "model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B", "Qwen3-4B-Thinking-2507"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
@@ -1141,7 +1143,7 @@ def test_against_original_qwen_3(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B"])
+@pytest.mark.parametrize("model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507"])
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
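Because the new checkpoints are only appended to the existing `parametrize` lists, they run through the same numerical-equivalence tests against the Hugging Face reference implementation. A hedged sketch of invoking just those cases locally (assumes `pytest` and `transformers` are installed; `-k` filters test IDs by substring):

```python
# Sketch (not part of this commit): run only the Thinking-2507 cases of the
# Qwen3 tests; "Thinking" matches the substring in the parametrized test IDs.
import pytest

pytest.main(["tests/test_model.py", "-k", "Thinking"])
```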

tutorials/download_model_weights.md

Lines changed: 2 additions & 2 deletions

@@ -48,8 +48,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B, 235B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
