
Commit 5c4c09f

ysjprojects, Borda, lantiga, and pre-commit-ci[bot] authored
Phi4 mini (#1949)
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Luca Antiga <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent dc4c0dc commit 5c4c09f

File tree: 6 files changed, +34 -6 lines changed


README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -137,6 +137,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
```

litgpt/config.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -1601,6 +1601,26 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
     ),
+    # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
+    dict(
+        name="Phi-4-mini-instruct",
+        hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
+        vocab_size=200019,
+        padded_vocab_size=200064,
+        block_size=131072,
+        n_embd=3072,
+        n_layer=32,
+        n_head=24,
+        n_query_groups=8,
+        rotary_percentage=0.75,
+        bias=False,
+        norm_class_name="RMSNorm",
+        intermediate_size=8192,
+        mlp_class_name="LLaMAMLP",
+        parallel_residual=False,
+        sliding_window_size=262145,
+        sliding_window_layer_placing="all",
+    ),
 ]
 configs.extend(phi)
 
```
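The new entry registers a named preset that `Config.from_name` can resolve. A minimal sketch (not part of this commit, assuming a `litgpt` version that includes it) of looking up the preset and inspecting a derived value:

```python
# Minimal sketch: resolve the new preset by name.
# Assumes litgpt is installed at a version containing this commit.
from litgpt.config import Config

config = Config.from_name("Phi-4-mini-instruct")
print(config.n_layer, config.n_head, config.n_query_groups)  # 32 24 8
print(config.head_size)  # 3072 // 24 = 128, derived from n_embd and n_head
```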

litgpt/scripts/convert_hf_checkpoint.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -323,7 +323,7 @@ def copy_weights_phi(
         "lm_head.bias": "lm_head.bias",
     }
 
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.startswith(("Phi-3", "phi-4", "Phi-4")):
         weight_map.update(
             {
                 "model.layers.{}.self_attn.qkv_proj.weight": "transformer.h.{}.attn.qkv.weight",
@@ -361,6 +361,9 @@ def copy_weights_phi(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+    if "lm_head.weight" not in state_dict and config.name.startswith("Phi-4"):
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
     for i in list(qkv_weights):
         for weight_type in list(qkv_weights[i]):
             qkv = qkv_weights[i][weight_type]
@@ -606,6 +609,5 @@ def convert_hf_checkpoint(
     for bin_file in sorted(bin_files):
         hf_weights = load_safetensors(bin_file) if bin_file.suffix == ".safetensors" else lazy_load(bin_file)
         copy_fn(sd, hf_weights, saver=saver, dtype=dtype, debug_mode=debug_mode)
-
     print(f"Saving converted checkpoint to {checkpoint_dir}")
     saver.save(sd)
```
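Phi-4-mini ties its output projection to the token embeddings, so the HF checkpoint ships no standalone `lm_head.weight`; the new fallback reuses `transformer.wte.weight` instead. A toy illustration of that fallback, independent of LitGPT and with hypothetical small shapes:

```python
import torch

# Stand-in for a converted state dict whose source checkpoint had no
# lm_head.weight (real shape would be padded_vocab_size x n_embd).
state_dict = {"transformer.wte.weight": torch.randn(8, 4)}

# Mirrors the fallback above: reuse the embedding matrix as the output head.
if "lm_head.weight" not in state_dict:
    state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

# Both names now reference the same tensor, i.e. the weights stay tied.
assert state_dict["lm_head.weight"] is state_dict["transformer.wte.weight"]
```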

litgpt/scripts/convert_lit_checkpoint.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -236,7 +236,7 @@ def copy_weights_phi(
         "lm_head.weight": "lm_head.weight",
         "lm_head.bias": "lm_head.bias",
     }
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         weight_map.update(
             {
                 "transformer.h.{}.attn.qkv.weight": "model.layers.{}.self_attn.qkv_proj.weight",
@@ -249,10 +249,12 @@ def copy_weights_phi(
     gate_up_proj_weights = defaultdict(dict)
 
     for from_name, param in lit_weights.items():
+        if from_name == "lm_head.weight" and config.name.startswith("Phi-4"):
+            continue
         name_template, layer_idx = layer_template(from_name)
         param = load_param(param, from_name, None)
         if from_name.endswith((".attn.qkv.weight", ".attn.qkv.bias")):
-            if config.name.startswith("Phi-3"):
+            if config.name.lower().startswith(("phi-3", "phi-4")):
                 to_names = (weight_map[name_template].format(layer_idx),)
                 params = (param,)
             else:
@@ -282,7 +284,7 @@ def copy_weights_phi(
             param = saver.store_early(param)
         state_dict[to_name] = param
 
-    if config.name.startswith("Phi-3"):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         for layer_idx in list(gate_up_proj_weights):
             fc_1_weight = gate_up_proj_weights[layer_idx]["fc_1"]
             fc_2_weight = gate_up_proj_weights[layer_idx]["fc_2"]
```
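The export path mirrors the import-side tying: since the HF format carries no standalone `lm_head.weight` for Phi-4 models, the converter skips that entry when writing back. A toy sketch of the skip (hypothetical placeholder values, independent of LitGPT):

```python
# Toy illustration of the export-side skip added above.
config_name = "Phi-4-mini-instruct"
lit_weights = {
    "transformer.wte.weight": "embedding-matrix",
    "lm_head.weight": "embedding-matrix",  # tied to wte in this family
}

exported = {}
for from_name, param in lit_weights.items():
    # The tied head is dropped; HF reconstructs it from the embeddings.
    if from_name == "lm_head.weight" and config_name.startswith("Phi-4"):
        continue
    exported[from_name] = param

assert "lm_head.weight" not in exported
```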

tests/test_model.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -332,7 +332,8 @@ def test_against_hf_phi(model_name, device, dtype):
 
 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4")
+    "model_name",
+    ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4", "Phi-4-mini-instruct"),
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),
```

tutorials/download_model_weights.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -35,6 +35,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
@@ -170,6 +171,7 @@ microsoft/Phi-3-mini-128k-instruct
 microsoft/Phi-3-mini-4k-instruct
 microsoft/Phi-3.5-mini-instruct
 microsoft/phi-4
+microsoft/Phi-4-mini-instruct
 mistralai/mathstral-7B-v0.1
 mistralai/Mistral-7B-Instruct-v0.1
 mistralai/Mistral-7B-Instruct-v0.2
```
