
Commit fe3ec59

Add Qwen3 2507 Instruct models (#2125)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent db8e878 · commit fe3ec59

File tree

4 files changed (+51 −14 lines):

- README.md
- litgpt/config.py
- tests/test_model.py
- tutorials/download_model_weights.md

README.md

Lines changed: 2 additions & 2 deletions

@@ -150,8 +150,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |

litgpt/config.py

Lines changed: 14 additions & 8 deletions

@@ -2877,11 +2877,11 @@ def norm_class(self) -> Type:
 ]
 configs.extend(qwen_3_moe)

-qwen_3_2507_thinking = [
+qwen_3_2507_thinking_instruct = [
     # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-235B-A22B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-Thinking-2507"),
+        name="Qwen3-235B-A22B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"),
         block_size=262144,
         head_size=128,
         vocab_size=151643,
@@ -2905,8 +2905,8 @@ def norm_class(self) -> Type:
     ),
     # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-30B-A3B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Thinking-2507"),
+        name="Qwen3-30B-A3B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"),
         block_size=262144,
         head_size=128,
         vocab_size=151643,
@@ -2930,8 +2930,8 @@ def norm_class(self) -> Type:
     ),
     # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json
     dict(
-        name="Qwen3-4B-Thinking-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-4B-Thinking-2507"),
+        name="Qwen3-4B-{}-2507",
+        hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
         block_size=262144,
         vocab_size=151643,
         padded_vocab_size=151936,
@@ -2951,7 +2951,13 @@ def norm_class(self) -> Type:
         norm_qk=True,
     ),
 ]
-configs.extend(qwen_3_2507_thinking)
+
+for c in qwen_3_2507_thinking_instruct:
+    for kind in ("Thinking", "Instruct"):
+        copy = deepcopy(c)
+        copy["name"] = c["name"].format(kind)
+        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+        configs.append(copy)

 #############
 # Salamandra
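The expansion loop above is the heart of the change: rather than registering each 2507 checkpoint twice, the config names carry a `{}` placeholder that is filled in once per variant. A minimal, self-contained sketch of that mechanism, using a hypothetical one-entry list (the real entries also set `block_size`, `head_size`, and the other fields shown in the diff):

```python
from copy import deepcopy

# One templated entry standing in for the three real ones in litgpt/config.py.
qwen_3_2507_thinking_instruct = [
    dict(
        name="Qwen3-4B-{}-2507",
        hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
    ),
]

configs = []
for c in qwen_3_2507_thinking_instruct:
    for kind in ("Thinking", "Instruct"):
        copy = deepcopy(c)  # deep copy: the nested hf_config dict must not be shared
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

print([c["name"] for c in configs])
# ['Qwen3-4B-Thinking-2507', 'Qwen3-4B-Instruct-2507']
```

The `deepcopy` is load-bearing: a shallow copy would mutate the shared `hf_config` in place on the first pass, leaving no `{}` for the second pass, so both variants would end up pointing at the Thinking checkpoint.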

tests/test_model.py

Lines changed: 13 additions & 2 deletions

@@ -1083,7 +1083,16 @@ def test_against_original_qwen_2_5(model_name, device, dtype):

 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ["Qwen3-0.6B", "Qwen3-8B", "Qwen3-4B-Base", "Qwen3-14B-Base", "Qwen3-32B", "Qwen3-4B-Thinking-2507"]
+    "model_name",
+    [
+        "Qwen3-0.6B",
+        "Qwen3-8B",
+        "Qwen3-4B-Base",
+        "Qwen3-14B-Base",
+        "Qwen3-32B",
+        "Qwen3-4B-Thinking-2507",
+        "Qwen3-4B-Instruct-2507",
+    ],
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),
@@ -1143,7 +1152,9 @@ def test_against_original_qwen_3(model_name, device, dtype):


 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507"])
+@pytest.mark.parametrize(
+    "model_name", ["Qwen3-30B-A3B", "Qwen3-235B-A22B", "Qwen3-235B-A22B-Thinking-2507", "Qwen3-235B-A22B-Instruct-2507"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
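The new names only pass these parametrized tests if they resolve to registered configs, which the expansion loop in `litgpt/config.py` now produces. A quick hedged sanity check, assuming litgpt with this commit is installed (`Config.from_name` is the existing lookup used by the suite):

```python
from litgpt.config import Config

# Each generated variant should resolve and carry the matching HF repo name.
for name in (
    "Qwen3-4B-Thinking-2507",
    "Qwen3-4B-Instruct-2507",
    "Qwen3-235B-A22B-Instruct-2507",
):
    cfg = Config.from_name(name)
    print(name, "->", cfg.hf_config["name"])
```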

tutorials/download_model_weights.md

Lines changed: 22 additions & 2 deletions

@@ -48,8 +48,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
-| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
-| Qwen3 MoE | 30B{Hybrid, Thinking-2507}, 235B{Hybrid, Thinking-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 | 0.6B, 1.7B, 4B{Hybrid, Thinking-2507, Instruct-2507}, 8B, 14B, 32B | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
+| Qwen3 MoE | 30B{Hybrid, Thinking-2507, Instruct-2507}, 235B{Hybrid, Thinking-2507, Instruct-2507} | Alibaba Group | [Qwen Team 2025](https://arxiv.org/abs/2505.09388/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
@@ -237,6 +237,26 @@ Qwen/Qwen2.5-Math-7B
 Qwen/Qwen2.5-Math-7B-Instruct
 Qwen/Qwen2.5-Math-72B
 Qwen/Qwen2.5-Math-72B-Instruct
+Qwen/Qwen3-0.6B
+Qwen/Qwen3-0.6B-Base
+Qwen/Qwen3-1.7B
+Qwen/Qwen3-1.7B-Base
+Qwen/Qwen3-4B
+Qwen/Qwen3-4B-Base
+Qwen/Qwen3-8B
+Qwen/Qwen3-8B-Base
+Qwen/Qwen3-14B
+Qwen/Qwen3-14B-Base
+Qwen/Qwen3-32B
+Qwen/Qwen3-30B-A3B
+Qwen/Qwen3-30B-A3B-Base
+Qwen/Qwen3-235B-A22B
+Qwen/Qwen3-4B-Thinking-2507
+Qwen/Qwen3-4B-Instruct-2507
+Qwen/Qwen3-30B-A3B-Thinking-2507
+Qwen/Qwen3-30B-A3B-Instruct-2507
+Qwen/Qwen3-235B-A22B-Thinking-2507
+Qwen/Qwen3-235B-A22B-Instruct-2507
 Qwen/QwQ-32B
 Qwen/QwQ-32B-Preview
 stabilityai/FreeWilly2
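With the repo IDs published in the tutorial list, the new Instruct checkpoints download like any other entry. A hedged usage sketch via the Python API, assuming litgpt with this commit installed and enough disk and memory for the 4B weights (the CLI route would be `litgpt download Qwen/Qwen3-4B-Instruct-2507`):

```python
from litgpt import LLM

# Fetches the checkpoint from Hugging Face on first use, then generates.
llm = LLM.load("Qwen/Qwen3-4B-Instruct-2507")
print(llm.generate("What do llamas eat?", max_new_tokens=50))
```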
