
Commit 99767f9

sxu authored and facebook-github-bot committed
TransformerBlock: support attention skips (#14826)
Summary: We want to support attention skips. This diff modifies `TransformerBlock` to make `attention_norm` and `attention` optional. Since our export script constructs the `TransformerBlock`s directly, this is enough for our use case. The top-level `Transformer` class still requires a single `attention_type`; making that interface support attention skips as well (which needs a different configuration for each layer) is outside the scope of this diff.

Reviewed By: billmguo

Differential Revision: D84003431
1 parent 0e74a17 commit 99767f9
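
As a rough illustration of the construction path the summary describes, the sketch below shows how an export flow that builds `TransformerBlock`s itself could hand selected layers an `AttentionSkip` while the remaining layers keep a regular attention. The helper names (`build_blocks`, `skip_layers`, `make_attention`) are illustrative only, and the no-argument `AttentionSkip()` constructor is an assumption not shown in this diff.

```python
# Hypothetical sketch, not part of this commit: build blocks directly, skipping
# attention on selected layers. AttentionSkip() with no arguments is an assumption.
from executorch.examples.models.llama.attention import AttentionSkip
from executorch.examples.models.llama.llama_transformer import TransformerBlock


def build_blocks(args, rope, skip_layers, make_attention):
    """args: ModelArgs; make_attention: caller-supplied factory for real attention layers."""
    blocks = []
    for layer_id in range(args.n_layers):
        if layer_id in skip_layers:
            attention = AttentionSkip()  # no-op attention for skipped layers (assumed no-arg init)
        else:
            attention = make_attention(args, layer_id, rope)
        blocks.append(TransformerBlock(args, attention))
    return blocks
```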

2 files changed: 17 additions & 2 deletions

examples/models/llama/attention.py

Lines changed: 12 additions & 0 deletions
@@ -516,3 +516,15 @@ def forward(
         output = self.wo(output)

         return output, None
+
+
+@register_attention("skip")
+class AttentionSkip(Attention):
+    def forward(
+        self,
+        x: torch.Tensor,
+        _freqs_cos: torch.Tensor,
+        _freqs_sin: torch.Tensor,
+        **_kwargs: ForwardOptions,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        return x, None
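
For context, the new `"skip"` attention is a pure pass-through: `forward` returns the hidden states untouched and no cache update. The check below is illustrative, not part of the commit; it assumes `ATTENTION_REGISTRY` is keyed by the string passed to `register_attention` and that `AttentionSkip` needs no constructor arguments.

```python
# Illustrative behavior check (not from the commit). Assumes a no-arg constructor
# and that the registry key matches the string passed to register_attention.
import torch

from executorch.examples.models.llama.attention import ATTENTION_REGISTRY

skip_attention = ATTENTION_REGISTRY["skip"]()  # -> AttentionSkip instance
x = torch.randn(1, 16, 256)                    # (batch, seq_len, dim)
freqs = torch.empty(0)                         # rotary tables are ignored by the skip
out, cache_update = skip_attention(x, freqs, freqs)
assert torch.equal(out, x) and cache_update is None
```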

examples/models/llama/llama_transformer.py

Lines changed: 5 additions & 2 deletions
@@ -15,6 +15,7 @@
 from executorch.examples.models.llama.attention import (
     Attention,
     ATTENTION_REGISTRY,
+    AttentionSkip,
     ForwardOptions,
 )
 from executorch.examples.models.llama.feed_forward import FeedForward
@@ -95,7 +96,8 @@ def __init__(self, args: ModelArgs, attention: Attention):
         else:
             self.feed_forward = FeedForward(dim=args.dim, hidden_dim=args.hidden_dim)

-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        if isinstance(self.attention, AttentionSkip):
+            self.attention_norm = nn.Identity()
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

     @classmethod
@@ -120,8 +122,9 @@ def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions): # x:
         h, attn_options_update = self.attention.forward(
             self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
         )
+        if not isinstance(self.attention, AttentionSkip):
+            h = x + h

-        h = x + h
         if hasattr(self, "block_sparse_moe"):
             out = h + self.block_sparse_moe(self.ffn_norm(h))
         else: