
Commit 29311c6

ai-edge-bot authored and copybara-github committed
Support Phi-4 model
PiperOrigin-RevId: 732233721
1 parent 883075e commit 29311c6

File tree

6 files changed: +334, -12 lines changed

ai_edge_torch/generative/examples/README.md

Lines changed: 8 additions & 7 deletions
@@ -26,13 +26,14 @@ found [here](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/tree/main).

 ## TinyLlama
 [TinyLlama](https://github.com/jzhang38/TinyLlama) is a popular OSS smaller version of Meta's Llama2 model, with only 1.1B parameters. [HuggingFace checkpoint](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).

-## Microsoft Phi-2 and 3.5-mini
-Microsoft Phi-2 and Phi-3.5-mini are also decoder-only LLMs with 2.7B and 3.82B
-parameters each. See details on
-[Kaggle](https://www.kaggle.com/models/Microsoft/phi/transformers/2) for Phi-2
-and [HuggingFace](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) for
-Phi-3.5-mini. Note that the example of Phi-3.5-mini supports up to 4K tokens,
-not to 128K tokens which the original Phi-3.5 supports.
+## Microsoft Phi-2, 3.5-mini, and 4-mini
+Microsoft Phi-2, Phi-3.5-mini, and Phi-4-mini are also decoder-only LLMs with
+2.7B, 3.82B, and 3.84B parameters respectively. See details on
+[Kaggle](https://www.kaggle.com/models/Microsoft/phi/transformers/2) for Phi-2,
+[HuggingFace](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) for Phi-3.5-mini,
+and [HuggingFace](https://huggingface.co/microsoft/Phi-4-mini-instruct) for Phi-4-mini.
+Note that the examples of Phi-3.5-mini and Phi-4-mini support up to 4K tokens,
+not the 128K tokens that the original models support.

 ## Apple OpenELM
[Apple OpenELM](https://huggingface.co/apple/OpenELM) is also a decoder-only LLM
Lines changed: 80 additions & 0 deletions (new file: Phi-4 conversion example)

# Copyright 2025 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Example of converting a Phi-4 model to a multi-signature tflite model."""

import os
import pathlib

from absl import app
from absl import flags
from ai_edge_torch.generative.examples.phi import phi4
from ai_edge_torch.generative.utilities import converter
from ai_edge_torch.generative.utilities.model_builder import ExportConfig

_CHECKPOINT_PATH = flags.DEFINE_string(
    'checkpoint_path',
    os.path.join(pathlib.Path.home(), 'Downloads/llm_data/phi4'),
    'The path to the model checkpoint, or directory holding the checkpoint.',
)
_OUTPUT_PATH = flags.DEFINE_string(
    'output_path',
    '/tmp/',
    'The path to export the tflite model.',
)
_OUTPUT_NAME_PREFIX = flags.DEFINE_string(
    'output_name_prefix',
    'phi4',
    'The prefix of the output tflite model name.',
)
_PREFILL_SEQ_LENS = flags.DEFINE_multi_integer(
    'prefill_seq_lens',
    (8, 64, 128, 256, 512, 1024),
    'List of the maximum sizes of prefill input tensors.',
)
_KV_CACHE_MAX_LEN = flags.DEFINE_integer(
    'kv_cache_max_len',
    1280,
    'The maximum size of KV cache buffer, including both prefill and decode.',
)
_QUANTIZE = flags.DEFINE_bool(
    'quantize',
    True,
    'Whether the model should be quantized.',
)
_LORA_RANKS = flags.DEFINE_multi_integer(
    'lora_ranks',
    None,
    'If set, the model will be converted with the provided list of LoRA ranks.',
)


def main(_):
  pytorch_model = phi4.build_model(
      _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
  )
  converter.convert_to_tflite(
      pytorch_model,
      output_path=_OUTPUT_PATH.value,
      output_name_prefix=_OUTPUT_NAME_PREFIX.value,
      prefill_seq_len=_PREFILL_SEQ_LENS.value,
      quantize=_QUANTIZE.value,
      lora_ranks=_LORA_RANKS.value,
      export_config=ExportConfig(),
  )


if __name__ == '__main__':
  app.run(main)
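Each value in --prefill_seq_lens becomes its own prefill signature in the exported multi-signature tflite model, alongside a decode signature for token-by-token generation. A minimal sketch of inspecting the result, assuming TensorFlow is installed; the filename is an assumption derived from the flag defaults above, not a name this commit guarantees:

import tensorflow as tf

# Hypothetical output name; use whatever the converter actually wrote
# under --output_path.
interpreter = tf.lite.Interpreter(model_path="/tmp/phi4_q8_ekv1280.tflite")
# Lists the exported signatures, e.g. one prefill entry per sequence
# length plus a decode entry.
print(interpreter.get_signature_list())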

ai_edge_torch/generative/examples/phi/phi3.py

Lines changed: 2 additions & 5 deletions
@@ -136,10 +136,7 @@ def _build_phi3_rope(

 class Phi3_5Mini(model_builder.DecoderOnlyModel):
   """A Phi-3.5 model built from the Edge Generative API layers."""
-
-  def __init__(self, config: cfg.ModelConfig):
-    super().__init__(config)
-    attn_config = self.config.block_config(0).attn_config
+  pass


 def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
@@ -150,7 +147,7 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
     is 1024.

   Returns:
-    The model config for a Phi-2 model.
+    The model config for a Phi-3.5 model.
   """
   attn_config = cfg.AttentionConfig(
       num_heads=32,
ai_edge_torch/generative/examples/phi/phi4.py

Lines changed: 165 additions & 0 deletions (new file)

# Copyright 2024 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Example of building a Phi-4 model, supporting up to 4K tokens instead of the original 128K."""

from functools import partial
import math
from typing import Tuple

import ai_edge_torch.generative.layers.model_config as cfg
from ai_edge_torch.generative.utilities import model_builder
import ai_edge_torch.generative.utilities.loader as loading_utils
import torch

TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
    ff_up_proj="model.layers.{}.mlp.gate_up_proj",
    ff_down_proj="model.layers.{}.mlp.down_proj",
    attn_fused_qkv_proj="model.layers.{}.self_attn.qkv_proj",
    attn_output_proj="model.layers.{}.self_attn.o_proj",
    pre_attn_norm="model.layers.{}.input_layernorm",
    post_attn_norm="model.layers.{}.post_attention_layernorm",
    embedding="model.embed_tokens",
    final_norm="model.norm",
)

# max_position_embeddings / original_max_position_embeddings in Phi-4 config.
ROPE_SCALE_FACTOR = 32

# RoPE short factor in Phi-4 config. According to the LongRoPE paper and its
# code in https://github.com/microsoft/LongRoPE, these values were searched
# with min=1.0, step=0.01 to minimize the error on a sample dataset.
ROPE_SHORT_FACTOR = [1.0] * 48


def _build_phi4_rope(
    input_pos: torch.Tensor,
    n_elem: int,
    base: int,
    condense_ratio: int,
    dtype: torch.dtype,
    device: torch.device,
    theta_factors: torch.Tensor,
    scale: float,
) -> Tuple[torch.Tensor, torch.Tensor]:
  """Computes Rotary Positional Embeddings for the Phi-4 model.

  It is a modified version of attn_utils.build_rope_cache with additional
  arguments for the Phi-4 model. It precomputes rotary positional embedding
  sin and cos values with scaling factors for quick lookup during inference.

  Args:
    input_pos (torch.Tensor): the given input sequence positions.
    n_elem (int): Each sequence's dimension.
    base (int, optional): RoPE base value.
    condense_ratio (int, optional): The ratio by which sequence indices are
      condensed.
    dtype (torch.dtype, optional): Output tensor's data type.
    device (torch.device, optional): Output tensor's device.
    theta_factors (torch.Tensor, optional): A tensor of shape (n_elem,) used
      to scale the theta values.
    scale (float, optional): A float used to scale the RoPE values.

  Returns:
    Tuple[torch.Tensor, torch.Tensor]: RoPE's cosine and sine waves.
  """
  theta = 1.0 / (base ** (torch.arange(0, n_elem, 2).float() / n_elem))
  theta = theta / theta_factors
  seq_idx = input_pos / condense_ratio
  idx_theta = torch.outer(seq_idx, theta)
  cos = torch.cos(idx_theta).to(dtype=dtype, device=device) * scale
  sin = torch.sin(idx_theta).to(dtype=dtype, device=device) * scale
  return cos, sin


class Phi4Mini(model_builder.DecoderOnlyModel):
  """A Phi-4 model built from the Edge Generative API layers."""
  pass


def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  """Returns the model config for a Phi-4 model.

  Args:
    kv_cache_max_len (int): The maximum sequence length of the KV cache.
      Default is 1024.

  Returns:
    The model config for a Phi-4 model.
  """
  attn_config = cfg.AttentionConfig(
      num_heads=24,
      head_dim=128,
      num_query_groups=8,
      rotary_base=10000,
      rotary_percentage=0.75,
      qkv_transpose_before_split=True,
  )
  ff_config = cfg.FeedForwardConfig(
      type=cfg.FeedForwardType.SEQUENTIAL,
      activation=cfg.ActivationConfig(cfg.ActivationType.SILU_GLU),
      intermediate_size=8192,
  )
  norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
  block_config = cfg.TransformerBlockConfig(
      attn_config=attn_config,
      ff_config=ff_config,
      pre_attention_norm_config=norm_config,
      post_attention_norm_config=norm_config,
  )

  max_seq_len = 4096
  # Create the RoPE callable.
  build_rope = partial(
      _build_phi4_rope,
      condense_ratio=1,
      dtype=torch.float32,
      device=torch.device("cpu"),
      theta_factors=torch.tensor(ROPE_SHORT_FACTOR),
      scale=math.sqrt(1 + math.log(ROPE_SCALE_FACTOR) / math.log(max_seq_len)),
  )

  config = cfg.ModelConfig(
      vocab_size=200064,
      num_layers=32,
      max_seq_len=max_seq_len,
      kv_cache_max_len=kv_cache_max_len,
      embedding_dim=3072,
      block_configs=block_config,
      final_norm_config=norm_config,
      enable_hlfb=True,
      build_rope=build_rope,
  )
  return config


def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
  config = get_model_config(kv_cache_max_len)
  config.vocab_size = 128
  config.num_layers = 2
  config.max_seq_len = 2 * kv_cache_max_len
  # Phi-4 has only one block config.
  config.block_config(0).ff_config.intermediate_size = 128
  return config


def build_model(checkpoint_path: str, **kwargs) -> torch.nn.Module:
  """Instantiates the model instance and loads checkpoint if provided."""
  return model_builder.build_decoder_only_model(
      checkpoint_path=checkpoint_path,
      config=get_model_config(**kwargs),
      tensor_names=TENSOR_NAMES,
      model_class=Phi4Mini,
  )
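The scale passed to _build_phi4_rope above is the LongRoPE-style attention factor sqrt(1 + ln(ROPE_SCALE_FACTOR) / ln(max_seq_len)). With ROPE_SCALE_FACTOR = 32 = 2^5 and max_seq_len = 4096 = 2^12, the log ratio is exactly 5/12, giving a constant of roughly 1.1902 baked into the cos/sin tables. A quick check using only values defined in this file:

import math

# sqrt(1 + ln(32)/ln(4096)) = sqrt(1 + 5/12) = sqrt(17/12)
scale = math.sqrt(1 + math.log(32) / math.log(4096))
print(f"{scale:.4f}")  # 1.1902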
Lines changed: 69 additions & 0 deletions (new file: Phi-4 verification example)

# Copyright 2025 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Verifies the reauthored Phi-4 model."""

import logging
import pathlib

from absl import app
from absl import flags
from ai_edge_torch.generative.examples.phi import phi4
from ai_edge_torch.generative.utilities import transformers_verifier
from ai_edge_torch.generative.utilities import verifier
import transformers


_PROMPTS = flags.DEFINE_multi_string(
    "prompts",
    "Instruct: Write an email about the weather Output:",
    "The input prompts to generate answers.",
)
_MAX_NEW_TOKENS = flags.DEFINE_integer(
    "max_new_tokens",
    30,
    "The maximum size of the generated tokens.",
)


def main(_):
  checkpoint = "microsoft/Phi-4-mini-instruct"
  logging.info("Loading the original model from: %s", checkpoint)
  original_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)

  # Locate the cached dir.
  cached_config_file = transformers.utils.cached_file(
      checkpoint, transformers.utils.CONFIG_NAME
  )
  reauthored_checkpoint = pathlib.Path(cached_config_file).parent
  logging.info("Building the reauthored model from: %s", reauthored_checkpoint)
  reauthored_model = phi4.build_model(reauthored_checkpoint)

  logging.info("Loading the tokenizer from: %s", checkpoint)
  tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

  verifier.verify_reauthored_model(
      original_model=transformers_verifier.TransformersModelWrapper(
          original_model
      ),
      reauthored_model=verifier.ReauthoredModelWrapper(reauthored_model),
      tokenizer=verifier.TokenizerWrapper(tokenizer),
      generate_prompts=_PROMPTS.value,
      max_new_tokens=_MAX_NEW_TOKENS.value,
  )


if __name__ == "__main__":
  app.run(main)
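verify_reauthored_model drives the original HuggingFace model and the reauthored one with the same prompts and compares their outputs; the wrapper classes and tolerances live in the verifier utilities, outside this commit. A rough sketch of the comparison idea, with the reauthored side elided:

import torch
import transformers

checkpoint = "microsoft/Phi-4-mini-instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
original = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)

tokens = tokenizer("Instruct: Write an email", return_tensors="pt")
with torch.no_grad():
  original_logits = original(**tokens).logits[0, -1]
# The reauthored model built by phi4.build_model(...) would be run on the
# same token ids and its final-position logits compared, e.g. with
# torch.allclose under a tolerance.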

ai_edge_torch/generative/test/test_model_conversion_large.py

Lines changed: 10 additions & 0 deletions
@@ -27,6 +27,7 @@
 from ai_edge_torch.generative.examples.paligemma import paligemma
 from ai_edge_torch.generative.examples.phi import phi2
 from ai_edge_torch.generative.examples.phi import phi3
+from ai_edge_torch.generative.examples.phi import phi4
 from ai_edge_torch.generative.examples.qwen import qwen
 from ai_edge_torch.generative.examples.qwen_vl import qwen_vl
 from ai_edge_torch.generative.examples.smollm import smollm
@@ -139,6 +140,15 @@ def test_phi3(self):
     pytorch_model = phi3.Phi3_5Mini(config).eval()
     self._test_model(config, pytorch_model, "prefill", atol=1e-5, rtol=1e-5)

+  @googletest.skipIf(
+      ai_edge_torch.config.in_oss,
+      reason="tests with custom ops are not supported in oss",
+  )
+  def test_phi4(self):
+    config = phi4.get_fake_model_config()
+    pytorch_model = phi4.Phi4Mini(config).eval()
+    self._test_model(config, pytorch_model, "prefill", atol=1e-3, rtol=1e-5)
+
   @googletest.skipIf(
       ai_edge_torch.config.in_oss,
       reason="tests with custom ops are not supported in oss",
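get_fake_model_config shrinks Phi-4 to 2 layers, a 128-token vocabulary, and a 128-wide feed-forward so the conversion test stays fast. A small sketch of the model the test exercises, using only names from this diff:

from ai_edge_torch.generative.examples.phi import phi4

# Tiny test-only variant; the printed parameter count is illustrative,
# not something the test asserts.
config = phi4.get_fake_model_config()
model = phi4.Phi4Mini(config).eval()
print(sum(p.numel() for p in model.parameters()))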
