
Commit c991533

qwen2.5 long context (Lightning-AI#1933)

Authored and committed by: ysjprojects, Borda, pre-commit-ci[bot], KaelanDt, shijie.yu

Co-authored-by: Jirka B <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: KaelanDt <[email protected]>
Co-authored-by: shijie.yu <[email protected]>

1 parent 01edd1b · commit c991533

File tree

4 files changed: +60 -1 lines changed

README.md (1 addition, 0 deletions)

@@ -146,6 +146,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
+| Qwen2.5 1M (Long Context) | 7B, 14B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwen2.5-1m/) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |

litgpt/config.py (47 additions, 0 deletions)

@@ -2330,6 +2330,53 @@ def norm_class(self) -> Type:
     ),
 ]
 
+qwen_2_5_1m = [
+    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-7B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=28,
+        n_head=28,
+        n_embd=3584,
+        n_query_groups=4,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=18944,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
+    dict(
+        name="Qwen2.5-14B-Instruct-1M",
+        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
+        block_size=1010000,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=48,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=13824,
+        norm_eps=1e-5,
+        rope_base=10000000,
+    ),
+]
+
+qwen_2_5.extend(qwen_2_5_1m)
+
 qwen_2_5_coder = [
     # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
     dict(
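For orientation, the dicts added above are consumed through LitGPT's existing name-based config lookup. A minimal sketch, assuming a litgpt installation that includes this commit (the printed values simply echo the fields in the diff):

```python
# Minimal sketch: resolve one of the new 1M-context configs by name.
# Config.from_name is LitGPT's existing lookup, used for all other Qwen2.5 entries.
from litgpt.config import Config

config = Config.from_name("Qwen2.5-7B-Instruct-1M")
print(config.block_size)                              # 1010000 (extended context window)
print(config.n_layer, config.n_head, config.n_embd)   # 28 28 3584
print(config.rope_base)                               # 10000000
```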

tests/convert/test_lit_checkpoint.py (9 additions, 1 deletion)

@@ -605,7 +605,15 @@ def test_check_conversion_supported_lora():
 
 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
+    "model_name",
+    (
+        "Qwen2.5-1.5B",
+        "Qwen2.5-Coder-1.5B",
+        "Qwen2.5-Math-1.5B",
+        "QwQ-32B-Preview",
+        "QwQ-32B",
+        "Qwen2.5-7B-Instruct-1M",
+    ),
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),

tutorials/download_model_weights.md (3 additions, 0 deletions)

@@ -44,6 +44,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
+| Qwen2.5 1M (Long Context) | 7B, 14B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwen2.5-1m/) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
 | QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
 | QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |

@@ -209,8 +210,10 @@ Qwen/Qwen2.5-3B
 Qwen/Qwen2.5-3B-Instruct
 Qwen/Qwen2.5-7B
 Qwen/Qwen2.5-7B-Instruct
+Qwen/Qwen2.5-7B-Instruct-1M
 Qwen/Qwen2.5-14B
 Qwen/Qwen2.5-14B-Instruct
+Qwen/Qwen2.5-14B-Instruct-1M
 Qwen/Qwen2.5-32B
 Qwen/Qwen2.5-32B-Instruct
 Qwen/Qwen2.5-72B
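With the repo ids listed above, downloading and loading should follow LitGPT's usual path. A hedged usage sketch using LitGPT's documented Python API (`LLM.load` fetches the weights on first use; whether your hardware can actually serve a 1M-token context is a separate concern):

```python
# Hedged usage sketch: load one of the newly listed checkpoints by repo id
# and run a short generation to confirm the weights resolve.
from litgpt import LLM

llm = LLM.load("Qwen/Qwen2.5-7B-Instruct-1M")
print(llm.generate("What is the capital of France?", max_new_tokens=32))
```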
