Commit c8d0b19

ai-edge-bot authored and copybara-github committed
Fix broken Phi2 and PaliGemma1
- Phi2 applies RoPE only partially (rotary_percentage=0.4): the first 40% of each query/key head's dimensions must be roped while the rest must be left untouched.
- The PaliGemma1 decoder expects _forward_with_embeds.
- Verified that every example's verify.py now passes.

PiperOrigin-RevId: 715167410
1 parent 58f1c71 commit c8d0b19
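For readers skimming the diff, here is a minimal standalone sketch of the partial-RoPE behavior this commit restores. It mirrors the new apply_rope/build_rope logic shown in rotary_position_embedding.py below, but it is not the library code, and the head_dim, rotary_percentage, and position values are illustrative assumptions.

import torch

def build_rope_sketch(input_pos: torch.Tensor, n_elem: int, base: int = 10_000):
  # cos/sin cover only the n_elem rotated dimensions (n_elem // 2 frequencies).
  freq_exponents = (2.0 / n_elem) * torch.arange(n_elem // 2, dtype=torch.float32)
  timescale = float(base) ** freq_exponents
  radians = input_pos.float()[:, None] / timescale[None, :]
  return torch.cos(radians), torch.sin(radians)

def apply_partial_rope_sketch(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
  # x: (seq_len, head_dim). Split into chunks of cos.size(-1); rotate only the
  # first two chunks as the (real, imaginary) halves, pass the rest through.
  rope_size = cos.size(-1)
  chunks = torch.split(x, rope_size, dim=-1)
  left = chunks[0] * cos - chunks[1] * sin
  right = chunks[1] * cos + chunks[0] * sin
  return torch.cat((left, right) + chunks[2:], dim=-1)

# Demo values are assumptions: head_dim=80, rotary_percentage=0.4 -> n_elem=32.
head_dim, rotary_percentage = 80, 0.4
n_elem = int(rotary_percentage * head_dim)            # 32 dims get rotated
cos, sin = build_rope_sketch(torch.arange(6), n_elem)
x = torch.randn(6, head_dim)
roped = apply_partial_rope_sketch(x, cos, sin)
assert torch.equal(roped[:, n_elem:], x[:, n_elem:])  # last 60% untouched

The key point is that the cos/sin tables are sized by n_elem alone, and the split in apply_rope leaves every chunk beyond the first two untouched.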

7 files changed (+14, -29 lines)

ai_edge_torch/generative/examples/gemma/gemma2.py
Lines changed: 1 addition & 3 deletions

@@ -143,9 +143,7 @@ def forward(
     # RoPE parameters are the same for all blocks. Use the first layer.
     attn_config = self.config.block_config(0).attn_config
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
-    rope = rotary_pos_emb.build_rope(
-        input_pos, n_elem, attn_config.head_dim, attn_config.rotary_base
-    )
+    rope = rotary_pos_emb.build_rope(input_pos, n_elem, attn_config.rotary_base)
     mask = [
         self.get_attention_mask(
             self.config.block_config(i).attn_config.attn_type, input_pos

ai_edge_torch/generative/examples/llama/llama.py
Lines changed: 0 additions & 1 deletion

@@ -37,7 +37,6 @@ def _build_llama3_rope_cache(
     low_freq_factor: float,
     high_freq_factor: float,
     max_seq_len: int,
-    **kwargs,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
   """Computes Rotary Positional Embeddings for Llama 3.2 model.

ai_edge_torch/generative/examples/paligemma/decoder.py
Lines changed: 1 addition & 3 deletions

@@ -67,9 +67,7 @@ def forward(
     # ROPE parameters for all attn_configs are the same. Take the first one.
     attn_config = self.config.block_config(0).attn_config
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
-    rope = rotary_pos_emb.build_rope(
-        repo_pos, n_elem, attn_config.head_dim, attn_config.rotary_base
-    )
+    rope = rotary_pos_emb.build_rope(repo_pos, n_elem, attn_config.rotary_base)

     # The first part of input_embeds are image embeddings. Diagonal causal mask
     # doesn't work here.

ai_edge_torch/generative/examples/paligemma/decoder2.py
Lines changed: 1 addition & 3 deletions

@@ -70,9 +70,7 @@ def forward(
     # ROPE parameters for all attn_configs are the same. Take the first one.
     attn_config = self.config.block_config(0).attn_config
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
-    rope = rotary_pos_emb.build_rope(
-        repo_pos, n_elem, attn_config.head_dim, attn_config.rotary_base
-    )
+    rope = rotary_pos_emb.build_rope(repo_pos, n_elem, attn_config.rotary_base)

     if mask is None:
       if called_by_generate:

ai_edge_torch/generative/examples/phi/phi3.py
Lines changed: 1 addition & 2 deletions

@@ -103,7 +103,6 @@ def _build_phi3_rope(
     device: torch.device,
     theta_factors: torch.Tensor,
     scale: float,
-    **kwargs,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
   """Computes Rotary Positional Embeddings for Phi-3.5 model.

@@ -173,6 +172,7 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       pre_attention_norm_config=norm_config,
       post_attention_norm_config=norm_config,
   )
+
   max_seq_len = 4096
   # Create the RoPE callable
   build_rope = partial(

@@ -182,7 +182,6 @@ def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
       device=torch.device("cpu"),
       theta_factors=torch.tensor(ROPE_SHORT_FACTOR),
       scale=math.sqrt(1 + math.log(ROPE_SCALE_FACTOR) / math.log(max_seq_len)),
-      max_seq_len=max_seq_len,
   )

   config = cfg.ModelConfig(
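For context on why the extra keyword arguments (**kwargs and the bound max_seq_len) could be dropped: the model config stores a RoPE builder created with functools.partial, and the generic model builder (see model_builder.py below) now calls it with exactly (input_pos, n_elem, base). A minimal standalone sketch of that binding pattern follows; _toy_rope_builder and its scale knob are made up for illustration and are not the Phi-3.5 implementation.

from functools import partial
from typing import Tuple

import torch

def _toy_rope_builder(
    input_pos: torch.Tensor,
    n_elem: int,
    base: int = 10_000,
    scale: float = 1.0,
) -> Tuple[torch.Tensor, torch.Tensor]:
  # Toy builder with the same positional interface the generic model builder
  # uses, plus one model-specific keyword that gets bound via partial below.
  freq_exponents = (2.0 / n_elem) * torch.arange(n_elem // 2, dtype=torch.float32)
  timescale = float(base) ** freq_exponents
  radians = input_pos.float()[:, None] / timescale[None, :]
  return torch.cos(radians) * scale, torch.sin(radians) * scale

# Model-specific knobs are bound once when the config is built...
build_rope = partial(_toy_rope_builder, scale=1.2)

# ...so callers only ever pass (input_pos, n_elem, base), matching the new
# rotary_pos_emb.build_rope signature.
cos, sin = build_rope(torch.arange(8), 32, 10_000)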

ai_edge_torch/generative/layers/rotary_position_embedding.py
Lines changed: 7 additions & 7 deletions

@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # Implementation for Rotary Position embedding. https://arxiv.org/pdf/2104.09864.pdf
+
 from typing import Tuple
 import torch

@@ -31,18 +32,17 @@ def apply_rope(
     output tensor of RoPE.
   """
   x = x.transpose(1, 2)
-  head_size = x.size(-1)
-  x1, x2 = torch.split(x, head_size // 2, dim=-1)
-  left = x1 * cos - x2 * sin
-  right = x2 * cos + x1 * sin
-  roped = torch.cat([left, right], dim=-1)
+  rope_size = cos.size(-1)
+  x_splited = torch.split(x, rope_size, dim=-1)
+  left = x_splited[0] * cos - x_splited[1] * sin
+  right = x_splited[1] * cos + x_splited[0] * sin
+  roped = torch.cat((left, right) + x_splited[2:], dim=-1)
   return roped.transpose(1, 2).type_as(x)


 def build_rope(
     input_pos: torch.Tensor,
     n_elem: int,
-    head_dim: int,
     base: int = 10_000,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
   """Computes rotary positional embedding cosine and sine tensors.

@@ -60,7 +60,7 @@ def build_rope(
     return None, None

   freq_exponents = (2.0 / n_elem) * torch.arange(
-      head_dim // 2, dtype=torch.float32
+      n_elem // 2, dtype=torch.float32
   )
   timescale = float(base) ** freq_exponents
   radians = input_pos.clone().unsqueeze(0).unsqueeze(-1) / timescale.unsqueeze(
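A hedged usage sketch of the updated build_rope signature (assumes an installed ai_edge_torch; the head_dim and rotary_percentage values are illustrative, not taken from the diff):

import torch
import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb

# Illustrative values; in the models above they come from attn_config.
head_dim = 80
rotary_percentage = 0.4   # Phi2-style partial RoPE
rotary_base = 10_000
input_pos = torch.arange(16)

# Old call: build_rope(input_pos, n_elem, head_dim, rotary_base).
# New call: head_dim is gone; cos/sin are sized from n_elem alone, and
# apply_rope infers the rotated width from cos.size(-1) at call time.
n_elem = int(rotary_percentage * head_dim)
cos, sin = rotary_pos_emb.build_rope(input_pos, n_elem, rotary_base)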

ai_edge_torch/generative/utilities/model_builder.py
Lines changed: 3 additions & 10 deletions

@@ -25,7 +25,6 @@
 from ai_edge_torch.generative.layers import lora as lora_utils
 import ai_edge_torch.generative.layers.attention_utils as attn_utils
 import ai_edge_torch.generative.layers.model_config as cfg
-import ai_edge_torch.generative.layers.rotary_position_embedding as rotary_pos_emb
 import ai_edge_torch.generative.utilities.loader as loading_utils
 import torch
 from torch import nn

@@ -115,23 +114,17 @@ def forward(
     # ROPE parameters for all attn_configs are the same. Take the first one.
     attn_config = self.config.block_config(0).attn_config
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
-    rope = self.config.build_rope(
-        input_pos=input_pos,
-        n_elem=n_elem,
-        base=attn_config.rotary_base,
-        head_dim=attn_config.head_dim,
-        # input_pos=input_pos, n_elem=n_elem, base=attn_config.rotary_base
-    )
+    rope = self.config.build_rope(input_pos, n_elem, attn_config.rotary_base)

     if mask is None:
       mask = self.mask_cache.index_select(2, input_pos)
       mask = mask[:, :, :, : self.config.kv_cache_max]

-    return self.forward_with_embeds(
+    return self._forward_with_embeds(
         input_embeds, rope, mask, input_pos, kv_cache, lora, export_config
     )

-  def forward_with_embeds(
+  def _forward_with_embeds(
       self,
       input_embeds: torch.Tensor,
       rope: Tuple[torch.Tensor, torch.Tensor],