Commit 5ca893d

ysjprojects, Borda, and pre-commit-ci[bot] authored

QwQ-32B (#1952)

Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Jirka B <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7789e82 commit 5ca893d

File tree

5 files changed: 33 additions, 5 deletions

- README.md
- litgpt/config.py
- tests/convert/test_lit_checkpoint.py
- tests/test_model.py
- tutorials/download_model_weights.md


README.md (2 additions, 1 deletion)

@@ -143,7 +143,8 @@ Every model is written from scratch to maximize performance and remove layers of
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
-| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
+| QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |
 | Salamandra | 2B, 7B | Barcelona Supercomputing Centre | [BSC-LTC 2024](https://github.com/BSC-LTC/salamandra) |

litgpt/config.py (22 additions, 1 deletion)

@@ -2267,11 +2267,32 @@ def norm_class(self) -> Type:
     configs.append(copy)

 qwq = [
+    # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json
+    dict(
+        name="QwQ-32B",
+        hf_config=dict(org="Qwen", name="QwQ-32B"),
+        block_size=131072,
+        vocab_size=151643,
+        padded_vocab_size=152064,
+        n_layer=64,
+        n_head=40,
+        n_embd=5120,
+        n_query_groups=8,
+        rotary_percentage=1.0,
+        parallel_residual=False,
+        bias=False,
+        attn_bias=True,
+        norm_class_name="RMSNorm",
+        mlp_class_name="LLaMAMLP",
+        intermediate_size=27648,
+        norm_eps=1e-5,
+        rope_base=1000000,
+    ),
     # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json
     dict(
         name="QwQ-32B-Preview",
         hf_config=dict(org="Qwen", name="QwQ-32B-Preview"),
-        block_size=131072,
+        block_size=32768,
         vocab_size=151643,
         padded_vocab_size=152064,
         n_layer=64,
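
For orientation, the new entry is consumed through litgpt's config lookup; the values above imply grouped-query attention with 40 query heads sharing 8 key/value groups and a head size of 5120 / 40 = 128. Below is a minimal sketch (not part of this commit) of reading those derived sizes back out, assuming a litgpt installation that already contains the QwQ-32B entry.

```python
# Hedged sketch: look up the newly registered config by name and derive the
# attention geometry implied by the fields added in this commit.
from litgpt import Config

config = Config.from_name("QwQ-32B")

head_size = config.n_embd // config.n_head                      # 5120 / 40 = 128
queries_per_kv_group = config.n_head // config.n_query_groups   # 40 / 8 = 5

print(head_size, queries_per_kv_group)
```

The Preview entry keeps the same geometry; the diff only lowers its block_size to 32768.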

tests/convert/test_lit_checkpoint.py (3 additions, 1 deletion)

@@ -529,7 +529,9 @@ def test_check_conversion_supported_lora():
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
+@pytest.mark.parametrize(
+    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
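
The widened model_name list multiplies with the (device, dtype) parametrization that follows it, so each added name contributes one collected test per device/dtype pair. A self-contained illustration of that expansion (not the repository's actual test body; the device/dtype pairs here are arbitrary):

```python
# Standalone illustration of stacked parametrize decorators: 5 model names
# x 2 (device, dtype) pairs -> pytest collects 10 test cases.
import pytest
import torch


@pytest.mark.parametrize(
    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
)
@pytest.mark.parametrize(
    ("device", "dtype"),
    [(torch.device("cpu"), torch.float32), (torch.device("cpu"), torch.float16)],
)
def test_case_expansion(model_name, device, dtype):
    # Placeholder body; the real test converts a checkpoint and checks the result.
    assert isinstance(model_name, str)
```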

tests/test_model.py (3 additions, 1 deletion)

@@ -800,7 +800,9 @@ def test_against_original_gemma_2(model_name, device, dtype):
 
 
 @torch.inference_mode()
-@pytest.mark.parametrize("model_name", ("Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview"))
+@pytest.mark.parametrize(
+    "model_name", ["Qwen2.5-1.5B", "Qwen2.5-Coder-1.5B", "Qwen2.5-Math-1.5B", "QwQ-32B-Preview", "QwQ-32B"]
+)
 @pytest.mark.parametrize(
     ("device", "dtype"),
     [
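
As with the conversion test, the change only widens the parametrized name list; the test body itself checks the litgpt implementation against the reference Hugging Face model. A rough, scaled-down sketch of the litgpt side only (the dimension overrides are arbitrary, and the weight-copy and logit-comparison steps are omitted):

```python
# Rough sketch, not the repository's test: instantiate a tiny QwQ-32B-shaped
# model and run one forward pass on CPU.  The real test also builds the
# reference Hugging Face model, copies weights across, and compares logits.
import torch
from litgpt import GPT, Config

config = Config.from_name(
    "QwQ-32B",
    block_size=128,        # shrink the config so it runs in seconds on CPU
    n_layer=2,
    n_embd=64,
    n_head=8,
    n_query_groups=2,
    intermediate_size=128,
)
model = GPT(config).eval()

token_ids = torch.randint(0, 1000, (1, 8))
with torch.inference_mode():
    logits = model(token_ids)
print(logits.shape)  # (1, 8, padded_vocab_size)
```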

tutorials/download_model_weights.md (3 additions, 1 deletion)

@@ -41,7 +41,8 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
 | Qwen2.5 Coder | 0.5B, 1.5B, 3B, 7B, 14B, 32B | Alibaba Group | [Hui, Binyuan et al. 2024](https://arxiv.org/abs/2409.12186) |
 | Qwen2.5 Math | 1.5B, 7B, 72B | Alibaba Group | [An, Yang et al. 2024](https://arxiv.org/abs/2409.12122) |
-| QwQ | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
+| QwQ | 32B | Alibaba Group | [Qwen Team 2025](https://qwenlm.github.io/blog/qwq-32b/) |
+| QwQ-Preview | 32B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwq-32b-preview/) |
 | R1 Distill Llama | 8B, 70B | DeepSeek AI | [DeepSeek AI 2025](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) |
 | RedPajama-INCITE | 3B, 7B | Together | [Together 2023](https://together.ai/blog/redpajama-models-v1) |
 | SmolLM2 | 135M, 360M, 1.7B | Hugging Face | [Hugging Face 2024](https://github.com/huggingface/smollm) |

@@ -223,6 +224,7 @@ Qwen/Qwen2.5-Math-7B
 Qwen/Qwen2.5-Math-7B-Instruct
 Qwen/Qwen2.5-Math-72B
 Qwen/Qwen2.5-Math-72B-Instruct
+Qwen/QwQ-32B
 Qwen/QwQ-32B-Preview
 stabilityai/FreeWilly2
 stabilityai/stable-code-3b
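
With the id listed, the checkpoint can be pulled through litgpt's usual entry points, e.g. `litgpt download Qwen/QwQ-32B` on the command line. Below is a hedged sketch of the equivalent Python API usage (assumes litgpt is installed and that enough disk space and GPU memory are available for a 32B model; the prompt is only a placeholder).

```python
# Hedged sketch: fetch and prompt the newly listed checkpoint via the litgpt
# Python API.  Weights are downloaded on first use.
from litgpt import LLM

llm = LLM.load("Qwen/QwQ-32B")
print(llm.generate("What is 2 * 3?", max_new_tokens=64))
```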
