
Commit 305350d

Update on "[ExecuTorch][BE] Split kv cache and SDPA for better code sharing"

Summary:

Why? We have coupled SDPA with the kv cache for a while. Initially this was done when we implemented the sdpa_with_kv_cache custom op to reduce the multiple-copy overhead of kv cache updates. (This could have been done with separate custom kv cache update and custom sdpa ops; recent changes enabled this.) Because the SDPA module owns the kv cache, we get (a) a non-composable implementation and (b) difficulty reusing model definitions and components from repos like tune. The result is multiple definitions of the same model, llama, lying around in ET, TorchChat, and Tune. This diff and subsequent ones move toward decoupling the custom kv cache and custom sdpa so they are composable and module-swap friendly with tune's model definition.

How: Earlier PRs decoupled the kv cache update from sdpa. So now:
1. Decouple the SDPA nn.Module from the KV cache.
2. Standardize the KVCache and SDPA interfaces: both operate on q, k, v tensors in [B, # heads, seq_len, head_dim] format.
3. Step 2 introduces extra transposes when KVCache and SDPA are replaced by custom modules, but a graph pass will undo those.

Test Plan: Existing tests. Make sure perf doesn't regress.

Differential Revision: [D67914054](https://our.internmc.facebook.com/intern/diff/D67914054)

[ghstack-poisoned]
2 parents ed78ae3 + ee290d0 commit 305350d

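To make the intended end state concrete, here is a minimal sketch of the decoupled interfaces described above. These are illustrative stand-ins, not the actual ExecuTorch `KVCache`/`SDPA` classes; the only things they share with the real code are the standardized [B, n_heads, seq_len, head_dim] tensor format and the `update(input_pos, k_val, v_val)` entry point.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MinimalKVCache(nn.Module):
    """Sketch of a standalone KV cache: buffers are [B, n_heads, max_seq_len, head_dim]."""

    def __init__(self, max_batch_size, n_heads, max_seq_len, head_dim, dtype=torch.float32):
        super().__init__()
        shape = (max_batch_size, n_heads, max_seq_len, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype))

    def update(self, input_pos, k_val, v_val):
        # k_val / v_val: [B, n_heads, seq_len, head_dim]; write along the seq dim (dim 2).
        self.k_cache[:, :, input_pos] = k_val
        self.v_cache[:, :, input_pos] = v_val
        return self.k_cache, self.v_cache


class MinimalSDPA(nn.Module):
    """Sketch of an SDPA module that no longer owns a KV cache."""

    def forward(self, q, k, v, mask=None):
        # q, k, v: [B, n_heads, seq_len, head_dim]
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
```

With this split, an attention layer first calls `kv_cache.update(...)` and then the cache-free SDPA module, which is the call sequence the llama_transformer.py change below introduces.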
6 files changed: +109 −195 lines changed

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ def get_custom_quant_ios_dtype(
     """
     This function is specific for llama inputs and outputs
     """
-    if node.op == "placeholder" and "attention_sdpa_kv_cache_past_" in node.name:
+    if node.op == "placeholder" and "attention_kv_cache_past_" in node.name:
         return kv_dtype

     # Tag index put node before copy node, because copy is a skipped node in qnn

examples/models/llama/llama_transformer.py

Lines changed: 1 addition & 4 deletions

@@ -286,15 +286,13 @@ def update(
 class SDPA(nn.Module):
     def __init__(
         self,
-        kv_cache: KVCache,
         dim: int,
         head_dim: int,
         n_rep: int,
         max_seq_len: int,
         enable_dynamic_shape: bool,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
         self.dim = dim
         self.head_dim = head_dim
         self.n_rep = n_rep

@@ -373,7 +371,6 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
                 args.enable_dynamic_shape,
             )
             self.SDPA = SDPA(
-                kv_cache=self.kv_cache,
                 dim=self.n_local_heads * self.head_dim,
                 head_dim=self.head_dim,
                 n_rep=self.n_rep,

@@ -406,7 +403,7 @@ def forward(

         if self.use_kv_cache:
             assert input_pos is not None
-            k, v = self.SDPA.kv_cache.update(input_pos, k, v)
+            k, v = self.kv_cache.update(input_pos, k, v)
             output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
             return self.wo(output)

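For illustration, here is how the sketch classes from the block near the top of this page would be driven with the new call sequence. Shapes are toy values, and `MinimalKVCache`/`MinimalSDPA` are the hypothetical stand-ins defined there, not ExecuTorch code.

```python
import torch

# Toy dimensions; the layout is the standardized [B, n_heads, seq_len, head_dim].
batch, n_heads, seq_len, head_dim = 1, 8, 4, 64
cache = MinimalKVCache(batch, n_heads, max_seq_len=128, head_dim=head_dim)
sdpa = MinimalSDPA()

q = torch.randn(batch, n_heads, seq_len, head_dim)
k = torch.randn(batch, n_heads, seq_len, head_dim)
v = torch.randn(batch, n_heads, seq_len, head_dim)
input_pos = torch.arange(seq_len)

# Mirrors the diff above: the attention layer updates its own cache,
# then hands the returned k/v tensors to the (cache-free) SDPA module.
k_full, v_full = cache.update(input_pos, k, v)
out = sdpa(q, k_full, v_full)  # [B, n_heads, seq_len, head_dim]
```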
examples/models/llama/source_transformation/attention_sink.py

Lines changed: 6 additions & 20 deletions

@@ -111,7 +111,6 @@ def __init__(
         self,
         n_heads: int,
         head_dim: int,
-        transpose_cache: bool,
         enable_dynamic_shape: bool,
         rope: RopeWithAttentionSink,
         window_size: int,

@@ -125,7 +124,6 @@ def __init__(
             max_seq_length=window_size + sink_size,
             n_heads=n_heads,
             head_dim=head_dim,
-            transpose_cache=transpose_cache,
             enable_dynamic_shape=enable_dynamic_shape,
             dtype=dtype,
         )

@@ -161,28 +159,17 @@ def evict_tokens(self, input_pos: torch.Tensor, seq_len: int) -> int:
                 input_pos_item + self.position_shift - self.sink_size - num_to_evict
             )
             num_empty_space = self.window_size - num_to_keep
-            dim_to_slice = 2 if self.transpose_cache else 1
+            dim_to_slice = 2
             k_to_keep = self.k_cache.narrow(
                 dim_to_slice,
                 self.sink_size + num_to_evict,  # pyre-ignore [6]
                 num_to_keep,  # pyre-ignore [6]
             )
-            if self.transpose_cache:
-                k_to_keep = self.rope.rerotate_k(
-                    k=k_to_keep.transpose(1, 2),
-                    original_position=(  # pyre-ignore [6]
-                        self.sink_size + num_to_evict
-                    ),
-                    new_position=self.sink_size,
-                ).transpose(1, 2)
-            else:
-                k_to_keep = self.rope.rerotate_k(
-                    k=k_to_keep,
-                    original_position=(  # pyre-ignore [6]
-                        self.sink_size + num_to_evict
-                    ),
-                    new_position=self.sink_size,
-                )
+            k_to_keep = self.rope.rerotate_k(
+                k=k_to_keep.transpose(1, 2),
+                original_position=(self.sink_size + num_to_evict),  # pyre-ignore [6]
+                new_position=self.sink_size,
+            ).transpose(1, 2)
             self.k_cache = torch.cat(
                 [
                     self.k_cache.narrow(dim_to_slice, 0, self.sink_size),

@@ -278,7 +265,6 @@ def _replace_attention(
         kv_cache_with_attention_sink = KVCacheWithAttentionSink(
             n_heads=kv_cache.n_heads,
             head_dim=kv_cache.head_dim,
-            transpose_cache=kv_cache.transpose_cache,
             enable_dynamic_shape=kv_cache.enable_dynamic_shape,
             rope=rope_with_attention_sink,
             max_batch_size=kv_cache.max_batch_size,

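Since `transpose_cache` is gone, the cache layout is fixed at [B, n_heads, seq_len, head_dim] and eviction always slices along dim 2. A toy illustration of that narrow, plus the transpose dance around `rerotate_k` (which, judging from the removed branch above, appears to operate on [B, seq_len, n_heads, head_dim] tensors); all shapes and values here are made up:

```python
import torch

# Toy cache in the now-fixed [B, n_heads, seq_len, head_dim] layout.
B, n_heads, seq_len, head_dim = 1, 4, 16, 8
k_cache = torch.randn(B, n_heads, seq_len, head_dim)

sink_size, num_to_evict, num_to_keep = 4, 3, 9

# dim 2 is the sequence dimension, so evicted tokens are sliced out there.
k_to_keep = k_cache.narrow(2, sink_size + num_to_evict, num_to_keep)
print(k_to_keep.shape)  # torch.Size([1, 4, 9, 8])

# The transpose(1, 2) before/after rerotate_k converts between the cache
# layout and a [B, seq_len, n_heads, head_dim] view, then back again.
k_for_rope = k_to_keep.transpose(1, 2)  # [1, 9, 4, 8]
k_back = k_for_rope.transpose(1, 2)     # [1, 4, 9, 8], same layout as the cache
```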
examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 6 additions & 2 deletions

@@ -145,8 +145,12 @@ def update(self, input_pos, k_val, v_val):
         )

         start_pos = input_pos[0].item()
-        _ = torch.ops.llama.update_cache(k_val, k_out, start_pos)
-        _ = torch.ops.llama.update_cache(v_val, v_out, start_pos)
+        if self.use_custom_update_cache_op:
+            _ = torch.ops.llama.update_cache(k_val, k_out, start_pos)
+            _ = torch.ops.llama.update_cache(v_val, v_out, start_pos)
+        else:
+            k_out[:, :, input_pos] = k_val
+            v_out[:, :, input_pos] = v_val

         return k_out.transpose(1, 2), v_out.transpose(1, 2)


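The fallback branch writes new entries with plain advanced indexing along dim 2, while the custom `torch.ops.llama.update_cache` op remains the fast path. A small, self-contained sketch of what that fallback write does (toy shapes, dense float tensors rather than the quantized buffers used by the real cache):

```python
import torch

# Toy tensors: dim 2 of the cache buffer is the dimension indexed by input_pos,
# matching the fallback branch in the diff (k_out[:, :, input_pos] = k_val).
k_out = torch.zeros(1, 4, 16, 8)   # hypothetical cache buffer
k_val = torch.randn(1, 4, 3, 8)    # new entries for 3 positions
input_pos = torch.tensor([5, 6, 7])

# Fallback path: plain advanced-indexing assignment.
k_out[:, :, input_pos] = k_val

# The same write expressed with index_copy_, to make the semantics explicit.
k_ref = torch.zeros(1, 4, 16, 8)
k_ref.index_copy_(2, input_pos, k_val)
assert torch.equal(k_out, k_ref)
```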
examples/models/llama/source_transformation/sdpa.py

Lines changed: 4 additions & 12 deletions

@@ -19,11 +19,9 @@
 class SDPACustom(torch.nn.Module):
     def __init__(
         self,
-        kv_cache: KVCache,
         dim: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
         self.dim = dim

     def forward(

@@ -65,7 +63,7 @@ def _replace_sdpa_with_custom_op(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPACustom(child.kv_cache, child.dim),
+                SDPACustom(child.dim),
             )
         else:
             _replace_sdpa_with_custom_op(child)

@@ -81,13 +79,11 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
 class SDPASimple(torch.nn.Module):
     def __init__(
         self,
-        kv_cache: KVCache,
         dim: int,
         head_dim: int,
         n_rep: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
         self.dim = dim
         self.head_dim = head_dim
         self.n_rep = n_rep

@@ -135,12 +131,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class SDPAFlex(torch.nn.Module):
     def __init__(
         self,
-        kv_cache: KVCache,
         dim: int,
         n_rep: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
         self.dim = dim
         self.n_rep = n_rep

@@ -177,7 +171,7 @@ def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPASimple(child.kv_cache, child.dim, child.head_dim, child.n_rep),
+                SDPASimple(child.dim, child.head_dim, child.n_rep),
             )
         else:
             replace_sdpa_with_simple_sdpa(child)

@@ -190,7 +184,7 @@ def replace_sdpa_with_flex_sdpa(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPAFlex(child.kv_cache, child.dim, child.n_rep),
+                SDPAFlex(child.dim, child.n_rep),
             )
         else:
             replace_sdpa_with_flex_sdpa(child)

@@ -222,13 +216,11 @@ class SDPACoreML(torch.nn.Module):

     def __init__(
         self,
-        kv_cache: KVCache,
         dim: int,
         head_dim: int,
         n_rep: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
         self.dim = dim
         self.head_dim = head_dim
         self.n_rep = n_rep

@@ -260,7 +252,7 @@ def replace_sdpa_with_coreml_sdpa(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPACoreML(child.kv_cache, child.dim, child.head_dim, child.n_rep),
+                SDPACoreML(child.dim, child.head_dim, child.n_rep),
             )
         else:
             replace_sdpa_with_coreml_sdpa(child)

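All of the replace_sdpa_with_* helpers in this file share the same recursive module-swap pattern, and dropping `kv_cache` from the constructors means the replacement now only needs shape-related attributes from the child. A generic sketch of that pattern (simplified; the real helpers check against the concrete `SDPA` class rather than taking it as an argument):

```python
import torch.nn as nn


def swap_modules(module: nn.Module, target_cls, make_replacement):
    """Recursively replace every child of type `target_cls` with
    `make_replacement(child)` -- the same walk used by the
    replace_sdpa_with_* helpers above."""
    for name, child in module.named_children():
        if isinstance(child, target_cls):
            setattr(module, name, make_replacement(child))
        else:
            swap_modules(child, target_cls, make_replacement)
    return module


# Hypothetical usage, mirroring replace_sdpa_with_simple_sdpa:
# swap_modules(model, SDPA, lambda child: SDPASimple(child.dim, child.head_dim, child.n_rep))
```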