
Commit 6738350
Author: Martin Yuan
Parent: 15c772c

Add abstract base class for attention mechanisms with unified interface

File tree: 12 files changed (+416 / -368 lines)


backends/qualcomm/tests/test_qnn_delegate.py
Lines changed: 2 additions & 1 deletion

@@ -38,7 +38,8 @@
     update_spill_fill_size,
 )

-from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward
+from executorch.examples.models.llama.model_args import ModelArgs
+from executorch.examples.models.llama.llama_transformer import MOEFeedForward

 from executorch.examples.qualcomm.utils import setup_common_args_and_variables

examples/models/llama/attention.py (new file)
Lines changed: 238 additions & 0 deletions
from abc import ABC, abstractmethod
from typing import Optional, Tuple, Any

import torch
import torch.nn as nn
import torch.nn.functional as F

from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.rope import Rope


class Attention(nn.Module, ABC):
    """Abstract base class for attention mechanisms with unified interface."""

    @abstractmethod
    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
        in_cache_state: Optional[Any] = None,
        out_cache_state: Optional[Any] = None,
    ) -> Tuple[torch.Tensor, Optional[Any]]:
        """Forward pass for the attention mechanism.

        Args:
            x: Input tensor of shape (batch_size, seq_len, dim)
            freqs_cos, freqs_sin: Rotary position embedding frequencies
            mask: Optional attention mask
            input_pos: Positions for KV cache updates
            in_cache_state, out_cache_state: Optional cache states

        Returns:
            Tuple of (output tensor, updated cache state)
        """
        pass

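
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): a minimal, hypothetical
# subclass showing how a custom attention variant plugs into the unified
# interface above. Only the forward() signature is taken from the Attention
# ABC; the class name and its internals are invented for illustration.
# ---------------------------------------------------------------------------
class _ExampleNaiveAttention(Attention):
    """Single-head, cache-free attention used only to illustrate the interface."""

    def __init__(self, dim: int):
        super().__init__()
        self.wq = nn.Linear(dim, dim, bias=False)
        self.wk = nn.Linear(dim, dim, bias=False)
        self.wv = nn.Linear(dim, dim, bias=False)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
        in_cache_state: Optional[Any] = None,
        out_cache_state: Optional[Any] = None,
    ) -> Tuple[torch.Tensor, Optional[Any]]:
        # Ignores rotary frequencies and caching; computes plain (causal) SDPA.
        q, k, v = self.wq(x), self.wk(x), self.wv(x)
        y = F.scaled_dot_product_attention(
            q, k, v, attn_mask=mask, is_causal=mask is None
        )
        return y, None
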
class KVCache(nn.Module):
    def __init__(
        self,
        max_batch_size: int,
        max_seq_length: int,
        n_heads: int,
        head_dim: int,
        enable_dynamic_shape: bool,
        dtype=torch.float32,
    ):
        super().__init__()
        self.max_seq_length = max_seq_length
        cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)

        self.max_batch_size = max_batch_size
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.enable_dynamic_shape = enable_dynamic_shape
        self.register_buffer(
            "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
        )
        self.register_buffer(
            "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
        )

    def update(
        self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # input_pos: [S], k_val: [B, H, S, D]
        if self.enable_dynamic_shape:
            start_pos = input_pos[0].item()
            torch._check_is_size(start_pos)
            torch._check(start_pos < self.max_seq_length)
            dim_to_slice = 2
            seq_length = k_val.size(dim_to_slice)
            # Replace the entry in the cache for this token.
            # The following lines are equivalent to:
            #   cache_k[:bsz, start_pos : start_pos + seqlen] = xk
            #   cache_v[:bsz, start_pos : start_pos + seqlen] = xv
            # when dim_to_slice is 1.
            # We use .narrow() here to make the compiler happy.
            # pyre-ignore: Incompatible parameter type [6]
            narrowed_k = self.k_cache.narrow(dim_to_slice, start_pos, seq_length)
            # pyre-ignore: Incompatible parameter type [6]
            narrowed_v = self.v_cache.narrow(dim_to_slice, start_pos, seq_length)

            narrowed_k.copy_(k_val)
            narrowed_v.copy_(v_val)
            return self.k_cache, self.v_cache
        else:
            k_out = self.k_cache
            v_out = self.v_cache
            k_out[:, :, input_pos] = k_val
            v_out[:, :, input_pos] = v_val

            return k_out, v_out

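
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): one incremental decoding step
# against the KVCache above. The sizes are arbitrary; shapes follow the
# "input_pos: [S], k_val: [B, H, S, D]" convention documented in update().
# ---------------------------------------------------------------------------
def _kv_cache_example() -> None:
    cache = KVCache(
        max_batch_size=1,
        max_seq_length=16,
        n_heads=2,
        head_dim=4,
        enable_dynamic_shape=False,
    )
    input_pos = torch.tensor([3])        # write position(s), shape [S]
    k_val = torch.randn(1, 2, 1, 4)      # [B, H, S, D] for a single new token
    v_val = torch.randn(1, 2, 1, 4)
    k_out, v_out = cache.update(input_pos, k_val, v_val)
    assert k_out.shape == (1, 2, 16, 4)  # the full cache buffers are returned
    assert torch.equal(k_out[:, :, 3], k_val[:, :, 0])
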
class SDPA(nn.Module):
    def __init__(
        self,
        dim: int,
        head_dim: int,
        n_rep: int,
        max_seq_len: int,
        enable_dynamic_shape: bool,
    ):
        super().__init__()
        self.dim = dim
        self.head_dim = head_dim
        self.n_rep = n_rep
        self.max_seq_len = max_seq_len
        self.enable_dynamic_shape = enable_dynamic_shape

    def forward(
        self,
        input_pos: torch.Tensor,
        q: torch.Tensor,  # Already has rotary embeddings. (bs, n_local_heads, seqlen, head_dim)
        k: torch.Tensor,  # Already has rotary embeddings. (bs, n_local_kv_heads, seqlen, head_dim)
        v: torch.Tensor,  # (bs, n_local_kv_heads, seqlen, head_dim)
        bsz,
        seqlen,
        mask: torch.Tensor,
    ) -> torch.Tensor:
        if self.enable_dynamic_shape:
            start_pos = input_pos[-1].item()
            torch._check_is_size(start_pos)
            torch._check(start_pos < self.max_seq_len)
            seq_length = q.size(2)
            # pyre-ignore: Incompatible parameter type [6]
            attn_mask = mask.narrow(0, start_pos, seq_length)
        else:
            attn_mask = mask[None, None, input_pos]

        # TODO(kimishpatel): This should not be necessary because
        # scaled_dot_product_attention can natively support GQA now,
        # but that path requires enable_gqa=True.
        k = k.repeat_interleave(self.n_rep, dim=1)
        v = v.repeat_interleave(self.n_rep, dim=1)
        y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)

        return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

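
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): for a single-token decode step
# the two mask-selection paths in SDPA.forward pick the same row of the causal
# mask; .narrow() is just the export-friendly spelling of indexing by input_pos.
# ---------------------------------------------------------------------------
def _sdpa_mask_example() -> None:
    causal_mask = torch.tril(torch.ones(8, 8, dtype=torch.bool))
    input_pos = torch.tensor([5])      # current decode position
    seq_length = 1                     # one new token
    narrowed = causal_mask.narrow(0, int(input_pos[-1].item()), seq_length)
    indexed = causal_mask[None, None, input_pos]
    assert torch.equal(narrowed, indexed[0, 0])
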
class AttentionMHA(Attention):
    def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
        super().__init__()
        # Architecture configuration
        self.use_kv_cache = args.use_kv_cache
        self.n_heads = args.n_heads
        self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert self.n_heads % self.n_kv_heads == 0, "Head counts must be divisible"

        # Model parallelism preparation (currently 1 for single device)
        model_parallel_size = 1
        self.n_local_heads = self.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size

        # Grouped-query attention repetition factor (queries per KV head)
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.head_dim
        self.max_batch_size = args.max_batch_size
        self.max_seq_len = args.max_seq_len
        self.dim = args.dim

        # Projection layers (combined heads)
        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)

        # Layer-specific configuration
        self.layer_id = layer_id
        self.rope = rope  # Rotary position embedding implementation

        # Causal mask buffer (not saved in the model state dict)
        causal_mask = torch.tril(
            torch.ones(
                self.max_seq_len, self.max_seq_len, dtype=torch.bool, device="cpu"
            )
        )
        self.register_buffer("mask", causal_mask, persistent=False)

        # KV cache and cache-aware SDPA, initialized only when caching is enabled
        if self.use_kv_cache:
            self.kv_cache = KVCache(
                args.max_batch_size,
                args.max_seq_len,
                self.n_kv_heads,
                self.head_dim,
                args.enable_dynamic_shape,
            )
            self.SDPA = SDPA(  # Optimized attention implementation
                dim=self.n_local_heads * self.head_dim,
                head_dim=self.head_dim,
                n_rep=self.n_rep,
                max_seq_len=self.max_seq_len,
                enable_dynamic_shape=args.enable_dynamic_shape,
            )

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
        in_cache_state: Optional[Any] = None,
        out_cache_state: Optional[Any] = None,
    ) -> Tuple[torch.Tensor, Optional[Any]]:
        bsz, seqlen, _ = x.shape

        # QKV projections with view operations to split heads
        q, k, v = self.wq(x), self.wk(x), self.wv(x)
        q = q.view(bsz, seqlen, self.n_local_heads, self.head_dim)  # Split into heads
        k = k.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # Rotary position embeddings (applied to both queries and keys)
        q, k = self.rope(q, k, freqs_cos, freqs_sin)

        # Transpose for attention computation: (bs, heads, seqlen, head_dim)
        q, k = q.transpose(1, 2), k.transpose(1, 2)
        v = v.transpose(1, 2)

        # KV cache path (optimized for incremental decoding)
        if self.use_kv_cache:
            assert input_pos is not None, "input_pos required for cache updates"
            k, v = self.kv_cache.update(input_pos, k, v)  # Update cache
            # Use the cache-aware SDPA implementation
            output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
            return self.wo(output), None  # No external cache state to return

        # Non-cached path (full-sequence processing)
        # Expand KV heads to match Q heads for grouped-query attention
        k = k.repeat_interleave(self.n_rep, dim=1)
        v = v.repeat_interleave(self.n_rep, dim=1)

        # Use PyTorch's optimized attention implementation
        output = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=self.mask[:seqlen, :seqlen],  # Causal mask
            dropout_p=0.0,
        )
        # Recombine heads and project to the output dimension
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
        return self.wo(output), None
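
A minimal smoke test for the new AttentionMHA class, written as a sketch under stated assumptions: the args object below is a SimpleNamespace stand-in carrying only the attributes AttentionMHA reads (a real ModelArgs would normally be used), and the rope argument is a hypothetical identity stand-in, whereas the real Rope applies rotary embeddings. It only checks output shapes on the non-cached path.

import torch
from types import SimpleNamespace

from executorch.examples.models.llama.attention import AttentionMHA

# Stand-in for ModelArgs (assumption: only the attributes read in __init__).
args = SimpleNamespace(
    use_kv_cache=False,
    n_heads=4,
    n_kv_heads=2,
    head_dim=16,
    dim=64,
    max_batch_size=1,
    max_seq_len=32,
    enable_dynamic_shape=False,
)

# Hypothetical identity stand-in for Rope; the real implementation rotates
# q and k using freqs_cos / freqs_sin.
def identity_rope(q, k, freqs_cos, freqs_sin):
    return q, k

attn = AttentionMHA(args, layer_id=0, rope=identity_rope)
x = torch.randn(1, 8, args.dim)
freqs = torch.zeros(8, args.head_dim // 2)  # placeholder; unused by the stand-in
out, cache_state = attn(x, freqs, freqs)
print(out.shape)  # torch.Size([1, 8, 64])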
