Skip to content

Commit 241bbd6

Browse files
ysjprojectsshijie.yupre-commit-ci[bot]
authored
phi-4 reasoning models (#2047)
Co-authored-by: shijie.yu <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 1602e96 commit 241bbd6

File tree

5 files changed

+110
-2
lines changed

5 files changed

+110
-2
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ Every model is written from scratch to maximize performance and remove layers of
139139
| Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
140140
| Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
141141
| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
142+
| Phi 4 Mini Reasoning | 3.8B | Microsoft Research | [Xu, Peng et al. 2025](https://arxiv.org/abs/2504.21233) |
143+
| Phi 4 Reasoning | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318) |
144+
| Phi 4 Reasoning Plus | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318) |
142145
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
143146
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
144147
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |

litgpt/config.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,6 +1744,44 @@ def norm_class(self) -> Type:
17441744
mlp_class_name="LLaMAMLP",
17451745
parallel_residual=False,
17461746
),
1747+
# https://huggingface.co/microsoft/Phi-4-reasoning/blob/main/config.json
1748+
dict(
1749+
name="Phi-4-reasoning",
1750+
hf_config=dict(org="microsoft", name="Phi-4-reasoning"),
1751+
vocab_size=100352,
1752+
padded_vocab_size=100352,
1753+
block_size=32768,
1754+
n_embd=5120,
1755+
n_layer=40,
1756+
n_head=40,
1757+
n_query_groups=10,
1758+
rotary_percentage=1.0,
1759+
bias=False,
1760+
norm_class_name="RMSNorm",
1761+
intermediate_size=17920,
1762+
rope_base=500000,
1763+
mlp_class_name="LLaMAMLP",
1764+
parallel_residual=False,
1765+
),
1766+
# https://huggingface.co/microsoft/Phi-4-reasoning-plus/blob/main/config.json
1767+
dict(
1768+
name="Phi-4-reasoning-plus",
1769+
hf_config=dict(org="microsoft", name="Phi-4-reasoning-plus"),
1770+
vocab_size=100352,
1771+
padded_vocab_size=100352,
1772+
block_size=32768,
1773+
n_embd=5120,
1774+
n_layer=40,
1775+
n_head=40,
1776+
n_query_groups=10,
1777+
rotary_percentage=1.0,
1778+
bias=False,
1779+
norm_class_name="RMSNorm",
1780+
intermediate_size=17920,
1781+
rope_base=500000,
1782+
mlp_class_name="LLaMAMLP",
1783+
parallel_residual=False,
1784+
),
17471785
# https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
17481786
dict(
17491787
name="Phi-4-mini-instruct",
@@ -1763,6 +1801,25 @@ def norm_class(self) -> Type:
17631801
parallel_residual=False,
17641802
sliding_window_size=262145,
17651803
),
1804+
# https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/config.json
1805+
dict(
1806+
name="Phi-4-mini-reasoning",
1807+
hf_config=dict(org="microsoft", name="Phi-4-mini-reasoning"),
1808+
vocab_size=200019,
1809+
padded_vocab_size=200064,
1810+
block_size=131072,
1811+
n_embd=3072,
1812+
n_layer=32,
1813+
n_head=24,
1814+
n_query_groups=8,
1815+
rotary_percentage=0.75,
1816+
bias=False,
1817+
norm_class_name="RMSNorm",
1818+
intermediate_size=8192,
1819+
mlp_class_name="LLaMAMLP",
1820+
parallel_residual=False,
1821+
sliding_window_size=262145,
1822+
),
17661823
]
17671824
configs.extend(phi)
17681825

litgpt/prompts.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,35 @@ def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs: str)
325325

326326
class Phi4(PromptStyle):
327327
def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs: str) -> str:
328-
return f"<|im_start|>user<|im_sep|>{prompt}<|im_end|><|im_start|>assistant<|im_sep|>"
328+
res = ""
329+
if sys_prompt:
330+
res += f"<|im_start|>system<|im_sep|>{sys_prompt}<|im_end|>"
331+
res += f"<|im_start|>user<|im_sep|>{prompt}<|im_end|><|im_start|>assistant<|im_sep|>"
332+
return res
333+
334+
335+
class Phi4Reasoning(PromptStyle):
336+
def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs: str) -> str:
337+
sys_prompt = (
338+
sys_prompt
339+
or "You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:"
340+
)
341+
return f"<|im_start|>system<|im_sep|>{sys_prompt}<|im_end|><|im_start|>user<|im_sep|>{prompt}<|im_end|><|im_start|>assistant<|im_sep|>"
342+
343+
344+
class Phi4Mini(PromptStyle):
345+
def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs: str) -> str:
346+
res = ""
347+
if sys_prompt:
348+
res += f"<|system|>{sys_prompt}<|end|>"
349+
res += f"<|user|>{prompt}<|end|><|assistant|>"
350+
return res
351+
352+
353+
class Phi4MiniReasoning(PromptStyle):
354+
def apply(self, prompt: str, *, sys_prompt: Optional[str] = None, **kwargs: str) -> str:
355+
sys_prompt = sys_prompt or "Your name is Phi, an AI math expert developed by Microsoft."
356+
return f"<|system|>{sys_prompt}<|end|><|user|>{prompt}<|end|><|assistant|>"
329357

330358

331359
class TinyLlama(PromptStyle):
@@ -409,6 +437,9 @@ def __init__(self):
409437
"phi-2": Phi2,
410438
"phi-3": Phi3,
411439
"phi-4": Phi4,
440+
"phi-4-reasoning": Phi4Reasoning,
441+
"phi-4-mini": Phi4Mini,
442+
"phi-4-mini-reasoning": Phi4MiniReasoning,
412443
"tinyllama": TinyLlama,
413444
"gemma": Gemma,
414445
"llama3": Llama3,
@@ -455,6 +486,12 @@ def model_name_to_prompt_style(model_name: str) -> PromptStyle:
455486
return Phi2()
456487
if re.search("Phi-3", model_name):
457488
return Phi3()
489+
if re.search("Phi-4-reasoning", model_name):
490+
return Phi4Reasoning()
491+
if re.search("Phi-4-mini-reasoning", model_name):
492+
return Phi4MiniReasoning()
493+
if re.search("Phi-4-mini", model_name):
494+
return Phi4Mini()
458495
if re.search("phi-4", model_name):
459496
return Phi4()
460497
if re.search(r"tiny-llama.*chat", model_name):

tests/test_model.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,15 @@ def test_against_hf_phi(model_name, device, dtype):
337337
@torch.inference_mode()
338338
@pytest.mark.parametrize(
339339
"model_name",
340-
("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4", "Phi-4-mini-instruct"),
340+
(
341+
"Phi-3-mini-4k-instruct",
342+
"Phi-3-mini-128k-instruct",
343+
"Phi-3.5-mini-instruct",
344+
"phi-4",
345+
"Phi-4-mini-instruct",
346+
"Phi-4-reasoning",
347+
"Phi-4-mini-reasoning",
348+
),
341349
)
342350
@pytest.mark.parametrize(
343351
("device", "dtype"),

tutorials/download_model_weights.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
3737
| Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)
3838
| Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
3939
| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
40+
| Phi 4 Mini Reasoning | 3.8B | Microsoft Research | [Xu, Peng et al. 2025](https://arxiv.org/abs/2504.21233) |
41+
| Phi 4 Reasoning | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318) |
42+
| Phi 4 Reasoning Plus | 14B | Microsoft Research | [Abdin et al. 2025](https://arxiv.org/abs/2504.21318) |
4043
| Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
4144
| Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
4245
| Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |

0 commit comments

Comments
 (0)