
Commit 5c4c09f

ysjprojects, Borda, lantiga, and pre-commit-ci[bot] authored
Phi4 mini (#1949)
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Luca Antiga <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent dc4c0dc commit 5c4c09f

File tree: 6 files changed, +34 -6 lines changed


README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -137,6 +137,7 @@ Every model is written from scratch to maximize performance and remove layers of
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219) |
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
```

litgpt/config.py

Lines changed: 20 additions & 0 deletions
```diff
@@ -1601,6 +1601,26 @@ def norm_class(self) -> Type:
         mlp_class_name="LLaMAMLP",
         parallel_residual=False,
     ),
+    # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
+    dict(
+        name="Phi-4-mini-instruct",
+        hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
+        vocab_size=200019,
+        padded_vocab_size=200064,
+        block_size=131072,
+        n_embd=3072,
+        n_layer=32,
+        n_head=24,
+        n_query_groups=8,
+        rotary_percentage=0.75,
+        bias=False,
+        norm_class_name="RMSNorm",
+        intermediate_size=8192,
+        mlp_class_name="LLaMAMLP",
+        parallel_residual=False,
+        sliding_window_size=262145,
+        sliding_window_layer_placing="all",
+    ),
 ]
 configs.extend(phi)
 
```
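The new entry registers a named preset that `Config.from_name` can resolve. A minimal sketch (not part of this commit, assuming a `litgpt` version that includes it) of looking up the preset and inspecting a derived value:

```python
# Minimal sketch: resolve the new preset by name.
# Assumes litgpt is installed at a version containing this commit.
from litgpt.config import Config

config = Config.from_name("Phi-4-mini-instruct")
print(config.n_layer, config.n_head, config.n_query_groups)  # 32 24 8
print(config.head_size)  # 3072 // 24 = 128, derived from n_embd and n_head
```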

litgpt/scripts/convert_hf_checkpoint.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -323,7 +323,7 @@ def copy_weights_phi(
         "lm_head.bias": "lm_head.bias",
     }
 
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.startswith(("Phi-3", "phi-4", "Phi-4")):
         weight_map.update(
             {
                 "model.layers.{}.self_attn.qkv_proj.weight": "transformer.h.{}.attn.qkv.weight",
@@ -361,6 +361,9 @@ def copy_weights_phi(
         if progress_per_file is not None:
             pbar.update(progress_per_file)
 
+    if "lm_head.weight" not in state_dict and config.name.startswith("Phi-4"):
+        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]
+
     for i in list(qkv_weights):
         for weight_type in list(qkv_weights[i]):
             qkv = qkv_weights[i][weight_type]
@@ -606,6 +609,5 @@ def convert_hf_checkpoint(
     for bin_file in sorted(bin_files):
         hf_weights = load_safetensors(bin_file) if bin_file.suffix == ".safetensors" else lazy_load(bin_file)
         copy_fn(sd, hf_weights, saver=saver, dtype=dtype, debug_mode=debug_mode)
-
     print(f"Saving converted checkpoint to {checkpoint_dir}")
     saver.save(sd)
```
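Phi-4-mini ties its output projection to the token embeddings, so the HF checkpoint ships no standalone `lm_head.weight`; the new fallback reuses `transformer.wte.weight` instead. A toy illustration of that fallback, independent of LitGPT and with hypothetical small shapes:

```python
import torch

# Stand-in for a converted state dict whose source checkpoint had no
# lm_head.weight (real shape would be padded_vocab_size x n_embd).
state_dict = {"transformer.wte.weight": torch.randn(8, 4)}

# Mirrors the fallback above: reuse the embedding matrix as the output head.
if "lm_head.weight" not in state_dict:
    state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

# Both names now reference the same tensor, i.e. the weights stay tied.
assert state_dict["lm_head.weight"] is state_dict["transformer.wte.weight"]
```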

litgpt/scripts/convert_lit_checkpoint.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -236,7 +236,7 @@ def copy_weights_phi(
         "lm_head.weight": "lm_head.weight",
         "lm_head.bias": "lm_head.bias",
     }
-    if config.name.startswith(("Phi-3", "phi-4")):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         weight_map.update(
             {
                 "transformer.h.{}.attn.qkv.weight": "model.layers.{}.self_attn.qkv_proj.weight",
@@ -249,10 +249,12 @@ def copy_weights_phi(
     gate_up_proj_weights = defaultdict(dict)
 
     for from_name, param in lit_weights.items():
+        if from_name == "lm_head.weight" and config.name.startswith("Phi-4"):
+            continue
         name_template, layer_idx = layer_template(from_name)
         param = load_param(param, from_name, None)
         if from_name.endswith((".attn.qkv.weight", ".attn.qkv.bias")):
-            if config.name.startswith("Phi-3"):
+            if config.name.lower().startswith(("phi-3", "phi-4")):
                 to_names = (weight_map[name_template].format(layer_idx),)
                 params = (param,)
             else:
@@ -282,7 +284,7 @@ def copy_weights_phi(
             param = saver.store_early(param)
         state_dict[to_name] = param
 
-    if config.name.startswith("Phi-3"):
+    if config.name.lower().startswith(("phi-3", "phi-4")):
         for layer_idx in list(gate_up_proj_weights):
             fc_1_weight = gate_up_proj_weights[layer_idx]["fc_1"]
             fc_2_weight = gate_up_proj_weights[layer_idx]["fc_2"]
```
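The export path mirrors the import-side tying: since the HF format carries no standalone `lm_head.weight` for Phi-4 models, the converter skips that entry when writing back. A toy sketch of the skip (hypothetical placeholder values, independent of LitGPT):

```python
# Toy illustration of the export-side skip added above.
config_name = "Phi-4-mini-instruct"
lit_weights = {
    "transformer.wte.weight": "embedding-matrix",
    "lm_head.weight": "embedding-matrix",  # tied to wte in this family
}

exported = {}
for from_name, param in lit_weights.items():
    # The tied head is dropped; HF reconstructs it from the embeddings.
    if from_name == "lm_head.weight" and config_name.startswith("Phi-4"):
        continue
    exported[from_name] = param

assert "lm_head.weight" not in exported
```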

tests/test_model.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -332,7 +332,8 @@ def test_against_hf_phi(model_name, device, dtype):
 
 @torch.inference_mode()
 @pytest.mark.parametrize(
-    "model_name", ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4")
+    "model_name",
+    ("Phi-3-mini-4k-instruct", "Phi-3-mini-128k-instruct", "Phi-3.5-mini-instruct", "phi-4", "Phi-4-mini-instruct"),
 )
 @pytest.mark.parametrize(
     ("device", "dtype"),
```

tutorials/download_model_weights.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -35,6 +35,7 @@ LitGPT supports a variety of LLM architectures with publicly available weights.
 | Phi 1.5 & 2 | 1.3B, 2.7B | Microsoft Research | [Li et al. 2023](https://arxiv.org/abs/2309.05463) |
 | Phi 3 & 3.5 | 3.8B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2404.14219)
 | Phi 4 | 14B | Microsoft Research | [Abdin et al. 2024](https://arxiv.org/abs/2412.08905) |
+| Phi 4 Mini Instruct | 3.8B | Microsoft Research | [Microsoft 2025](https://arxiv.org/abs/2503.01743) |
 | Platypus | 7B, 13B, 70B | Lee et al. | [Lee, Hunter, and Ruiz 2023](https://arxiv.org/abs/2308.07317) |
 | Pythia | {14,31,70,160,410}M, {1,1.4,2.8,6.9,12}B | EleutherAI | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) |
 | Qwen2.5 | 0.5B, 1.5B, 3B, 7B, 14B, 32B, 72B | Alibaba Group | [Qwen Team 2024](https://qwenlm.github.io/blog/qwen2.5/) |
@@ -170,6 +171,7 @@ microsoft/Phi-3-mini-128k-instruct
 microsoft/Phi-3-mini-4k-instruct
 microsoft/Phi-3.5-mini-instruct
 microsoft/phi-4
+microsoft/Phi-4-mini-instruct
 mistralai/mathstral-7B-v0.1
 mistralai/Mistral-7B-Instruct-v0.1
 mistralai/Mistral-7B-Instruct-v0.2
```
