
Commit 6d6c3a0

sxu authored and facebook-github-bot committed
TransformerBlock: support attention skips
Summary: We want to support attention skips. This diff modifies `TransformerBlock` to make `attention_norm` and `attention` optional. Since our export script constructs the `TransformerBlock`s directly, this is enough for our use case. The top-level `Transformer` class still requires a single `attention_type`; making that interface support attention skips as well (which would require per-layer configuration) is out of scope for this diff.

Differential Revision: D84003431
1 parent a39866c commit 6d6c3a0

2 files changed (+22, -14 lines)

examples/models/llama/llama_transformer.py

Lines changed: 21 additions & 13 deletions
@@ -71,7 +71,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class TransformerBlock(nn.Module):
-    def __init__(self, args: ModelArgs, attention: Attention):
+    def __init__(self, args: ModelArgs, attention: Optional[Attention]):
         """
         Transformer block with support for pre-norm and post-norm.
         Args:
@@ -95,7 +95,8 @@ def __init__(self, args: ModelArgs, attention: Attention):
         else:
             self.feed_forward = FeedForward(dim=args.dim, hidden_dim=args.hidden_dim)
 
-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        if self.attention is not None:
+            self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
 
     @classmethod
@@ -107,21 +108,28 @@ def from_type(cls, layer_id, args, rope) -> "TransformerBlock":
             args (ModelArgs): model configuration parameters.
             rope (Rope): the rope object to use for rotary embeddings.
         """
-        if args.attention_type not in ATTENTION_REGISTRY:
-            raise ValueError(
-                f"Unknown attention type: {args.attention_type}. "
-                f"Available: {list(ATTENTION_REGISTRY.keys())}"
-            )
-        cls = ATTENTION_REGISTRY[args.attention_type]
-        attention = cls(args, layer_id, rope, **args.attention_kwargs)
+        if args.attention_type is not None:
+            if args.attention_type not in ATTENTION_REGISTRY:
+                raise ValueError(
+                    f"Unknown attention type: {args.attention_type}. "
+                    f"Available: {list(ATTENTION_REGISTRY.keys())}"
+                )
+            cls = ATTENTION_REGISTRY[args.attention_type]
+            attention = cls(args, layer_id, rope, **args.attention_kwargs)
+        else:
+            attention = None
         return TransformerBlock(args, attention)
 
     def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions):  # x: 1xN
-        h, attn_options_update = self.attention.forward(
-            self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
-        )
+        if self.attention is not None:
+            h, attn_options_update = self.attention.forward(
+                self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
+            )
+            h = x + h
+        else:
+            h = x
+            attn_options_update = attn_options
 
-        h = x + h
         if hasattr(self, "block_sparse_moe"):
             out = h + self.block_sparse_moe(self.ffn_norm(h))
         else:
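For intuition, the following is a minimal, self-contained sketch (not code from this diff) of the behavior the new forward path enables: when `attention` is None, the block skips `attention_norm` and the attention residual and reduces to the FFN sub-layer. The `TinyBlock` class, its dimensions, and the use of `nn.LayerNorm`/`nn.Linear` as stand-ins for the repo's `RMSNorm` and `FeedForward` are illustrative assumptions only.

from typing import Optional

import torch
import torch.nn as nn


class TinyBlock(nn.Module):
    # Simplified stand-in for TransformerBlock; LayerNorm/Linear replace the
    # repo's RMSNorm/FeedForward purely to keep the sketch self-contained.
    def __init__(self, dim: int, attention: Optional[nn.Module]):
        super().__init__()
        self.attention = attention
        if self.attention is not None:
            self.attention_norm = nn.LayerNorm(dim)
        self.ffn_norm = nn.LayerNorm(dim)
        self.feed_forward = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.attention is not None:
            h = x + self.attention(self.attention_norm(x))
        else:
            h = x  # attention skipped: input passes straight to the FFN sub-layer
        return h + self.feed_forward(self.ffn_norm(h))


x = torch.randn(1, 8, 16)
attn_block = TinyBlock(dim=16, attention=nn.Identity())  # placeholder attention module
skip_block = TinyBlock(dim=16, attention=None)           # attention-skip layer
print(attn_block(x).shape, skip_block(x).shape)  # both: torch.Size([1, 8, 16])

An export script can mix such blocks freely, passing a concrete attention module for some layers and None for others, which is the per-layer flexibility the commit summary refers to.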

examples/models/llama/model_args.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ class ModelArgs:
     moe: bool = False  # True to enable the MoE (Mixture of Experts)
     num_experts: int = 8  # Number of experts
     num_activated_experts: int = 2  # Number of experts to activate
-    attention_type: str = "mha"  # Attention type, registered in attention.py
+    attention_type: Optional[str] = "mha"  # Attention type, registered in attention.py
     norm_type: str = "rmsnorm"  # Normalization type, registered in norm.py
     act_fn: ActFn = dataclasses.field(default=ActFn.SILU)  # Activation function type
     attention_qkv_bias: bool = False
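To illustrate what the `Optional[str]` field buys, here is a hedged, self-contained sketch of the factory pattern that `TransformerBlock.from_type` now follows: a None attention type bypasses the registry lookup and yields `attention=None`, i.e. an attention-skip block. The registry contents and the `build_attention` helper below are hypothetical stand-ins, not the repo's API.

from typing import Callable, Dict, Optional


# Hypothetical registry; the real one lives in attention.py as ATTENTION_REGISTRY.
ATTENTION_REGISTRY: Dict[str, Callable[[int], str]] = {
    "mha": lambda layer_id: f"MHA(layer_id={layer_id})",
}


def build_attention(attention_type: Optional[str], layer_id: int) -> Optional[str]:
    # Mirrors the None short-circuit added to TransformerBlock.from_type above.
    if attention_type is None:
        return None  # this layer skips attention entirely
    if attention_type not in ATTENTION_REGISTRY:
        raise ValueError(
            f"Unknown attention type: {attention_type}. "
            f"Available: {list(ATTENTION_REGISTRY.keys())}"
        )
    return ATTENTION_REGISTRY[attention_type](layer_id)


print(build_attention("mha", 0))  # MHA(layer_id=0)
print(build_attention(None, 1))   # None -> attention-skip block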
