Add abstract base class for attention mechanisms with unified interface

Martin Yuan · Martin Yuan · commit 09786d8bc711 · 2025-01-29T11:17:47.000-08:00
diff --git a/examples/models/llama/attention.py b/examples/models/llama/attention.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Tuple, TYPE_CHECKING
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+if TYPE_CHECKING:
+    # Annotation-only: llama_transformer imports this module while it loads,
+    # so importing it back at runtime would be circular.
+    from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope
+
+class Attention(nn.Module, ABC):
+    """Common interface that every attention implementation must expose."""
+
+    @abstractmethod
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        input_pos: Optional[torch.Tensor] = None,
+        in_cache_state: Optional[Any] = None,
+        out_cache_state: Optional[Any] = None,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        """Compute attention over ``x``; concrete subclasses override this.
+
+        Args:
+            x: Input activations shaped (batch_size, seq_len, dim).
+            freqs_cos, freqs_sin: Rotary position-embedding frequencies.
+            mask: Attention mask, if the caller supplies one.
+            input_pos: Positions used for KV-cache updates.
+            in_cache_state, out_cache_state: Opaque cache states.
+
+        Returns:
+            A tuple of (output tensor, updated cache state).
+        """
+
+class AttentionMHA(Attention):
+    """Multi-head attention with grouped KV heads and an optional KV cache."""
+
+    def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
+        """Build the projections, causal-mask buffer and optional KV cache.
+
+        ``args`` supplies the sizes and flags read below, ``layer_id`` is only
+        stored, and ``rope`` is later called as rope(q, k, freqs_cos, freqs_sin).
+        """
+        super().__init__()
+        self.use_kv_cache = args.use_kv_cache
+        self.n_heads = args.n_heads
+        self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert self.n_heads % self.n_kv_heads == 0, "Head counts must be divisible"
+
+        # No model parallelism yet, so local head counts equal the global ones.
+        model_parallel_size = 1
+        self.n_local_heads = self.n_heads // model_parallel_size
+        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+        # Number of query heads that share each KV head.
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.head_dim
+        self.max_batch_size = args.max_batch_size
+        self.max_seq_len = args.max_seq_len
+        self.dim = args.dim
+
+        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
+        self.layer_id = layer_id
+        self.rope = rope
+
+        # Lower-triangular causal mask; non-persistent, so kept out of state_dict.
+        causal_mask = torch.tril(
+            torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool, device="cpu")
+        )
+        self.register_buffer("mask", causal_mask, persistent=False)
+
+        if self.use_kv_cache:
+            # Imported lazily: llama_transformer imports this module while it is
+            # still loading, so a module-level import here would be circular.
+            from executorch.examples.models.llama.llama_transformer import KVCache, SDPA
+            self.kv_cache = KVCache(
+                args.max_batch_size,
+                args.max_seq_len,
+                self.n_kv_heads,
+                self.head_dim,
+                args.enable_dynamic_shape,
+            )
+            self.SDPA = SDPA(
+                dim=self.n_local_heads * self.head_dim,
+                head_dim=self.head_dim,
+                n_rep=self.n_rep,
+                max_seq_len=self.max_seq_len,
+                enable_dynamic_shape=args.enable_dynamic_shape,
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        input_pos: Optional[torch.Tensor] = None,
+        in_cache_state: Optional[Any] = None,
+        out_cache_state: Optional[Any] = None,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        """Attend over ``x`` and return ``(output, None)``.
+
+        ``mask`` and both cache-state arguments are accepted for interface parity
+        but never read; ``input_pos`` is required when ``use_kv_cache`` is set.
+        """
+        bsz, seqlen, _ = x.shape
+
+        q, k, v = self.wq(x), self.wk(x), self.wv(x)
+        # Split each projection into heads: (bsz, seqlen, heads, head_dim).
+        q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+        q, k = self.rope(q, k, freqs_cos, freqs_sin)
+        # Swap the sequence and head axes for the attention kernels.
+        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+        if self.use_kv_cache:
+            assert input_pos is not None, "input_pos required for cache updates"
+            k, v = self.kv_cache.update(input_pos, k, v)
+            output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
+            # The cache lives on the module, so no cache state is handed back.
+            return self.wo(output), None
+
+        # Repeat KV heads so every query head has a matching key/value head.
+        k = k.repeat_interleave(self.n_rep, dim=1)
+        v = v.repeat_interleave(self.n_rep, dim=1)
+        output = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=self.mask[:seqlen, :seqlen], dropout_p=0.0
+        )
+        # Merge the heads back into one feature axis before the output projection.
+        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+        return self.wo(output), None
diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py
@@ -13,6 +13,9 @@
 
 import torch
 import torch.nn.functional as F
+from executorch.examples.models.llama.attention import (
+    AttentionMHA,
+)
 
 from executorch.examples.models.llama.rope import (
     hf_apply_rotary_emb,
@@ -328,102 +331,6 @@ def forward(
         return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
 
 
-class Attention(nn.Module):
-    def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
-        super().__init__()
-        self.use_kv_cache = args.use_kv_cache
-        self.n_heads = args.n_heads
-        self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        assert self.n_heads % self.n_kv_heads == 0
-        model_parallel_size = 1
-        self.n_local_heads = self.n_heads // model_parallel_size
-        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
-        self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = args.head_dim
-        self.max_batch_size = args.max_batch_size
-        self.max_seq_len = args.max_seq_len
-        self.dim = args.dim
-        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
-        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
-
-        self.layer_id = layer_id
-
-        self.rope = rope
-
-        causal_mask = torch.tril(
-            torch.ones(
-                self.max_seq_len,
-                self.max_seq_len,
-                dtype=torch.bool,
-                device="cpu",
-            )
-        )
-        self.register_buffer("mask", causal_mask, persistent=False)
-
-        if self.use_kv_cache:
-            self.kv_cache = KVCache(
-                args.max_batch_size,
-                args.max_seq_len,
-                self.n_kv_heads,
-                self.head_dim,
-                args.enable_dynamic_shape,
-            )
-            self.SDPA = SDPA(
-                dim=self.n_local_heads * self.head_dim,
-                head_dim=self.head_dim,
-                n_rep=self.n_rep,
-                max_seq_len=self.max_seq_len,
-                enable_dynamic_shape=args.enable_dynamic_shape,
-            )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        freqs_cos: torch.Tensor,
-        freqs_sin: torch.Tensor,
-        input_pos: Optional[torch.Tensor] = None,
-    ):
-        bsz, seqlen, _ = x.shape
-
-        # QKV
-        q, k, v = self.wq(x), self.wk(x), self.wv(x)
-        # We need view_copy elimination
-        q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)
-        k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-        v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-
-        # RoPE relative positional embeddings
-        q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)
-
-        q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-
-        if self.use_kv_cache:
-            assert input_pos is not None
-            k, v = self.kv_cache.update(input_pos, k, v)
-            output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
-            return self.wo(output)
-
-        # grouped multiquery attention: expand out keys and values
-        k = k.repeat_interleave(self.n_rep, dim=1)
-        v = v.repeat_interleave(self.n_rep, dim=1)
-
-        assert hasattr(self, "mask")
-
-        mask = self.mask[:seqlen, :seqlen]
-
-        output = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
-
-        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
-
-        output = self.wo(output)
-
-        return output
-
-
 class FeedForward(nn.Module):
     def __init__(self, args: ModelArgs):
         super().__init__()
@@ -490,7 +397,7 @@ def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
         self.n_heads = args.n_heads
         self.dim = args.dim
         self.head_dim = args.head_dim
-        self.attention = Attention(args, layer_id, rope)
+        self.attention = AttentionMHA(args, layer_id, rope)
         if args.moe:
             self.block_sparse_moe = MOEFeedForward(args)
         else:
@@ -500,7 +407,7 @@ def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
 
     def forward(self, x, freqs_cos, freqs_sin, input_pos=None):  # x: 1xN
         h = self.attention.forward(
-            self.attention_norm(x), freqs_cos, freqs_sin, input_pos
+            self.attention_norm(x), freqs_cos, freqs_sin, input_pos=input_pos
         )
 
         h = x + h