
Commit 6e8cff5

Update on " [ExecuTorch][BE] Split kv cache and SDPA for better code sharing"
Summary: Why? We have coupled SDPA with kv cache for a while. Initially this was done as we implemented sdpa_with_kv_cache custom op to reduce multiple copy overheads from kv cache update. (This could have been done by having separate custom kv cache update and custom sdpa op. Recent changes enabled this.) As a result of SDPA module owning kv cache, we get a) non-composable implementation and b) harder to reuse model definition and components from repos like tune. Output of this is that we have multiple definition of the same model, llama, lying around in ET, TorchChat and Tune. This diff and subsequent ones will try to move in the direction where custom kv cache and custom sdpa become decoupled and composable, making it more module-swap friendly with tune's model definition. How. Earlier PRs decoupled kv cache update from sdpa. So now 1. Decouple SDPA nn.Module from KV cache. 2. Standardize on KVCache and SDPA interface. That is KVCache and SDPA both operate on q, k, v in [B, # heads, seq_len, head_dim] formatted tensors. 3. 2 will introduce multiple tranposes when KVCache and SDPA are replaced by custom modules, but we will write graph pass to undo those. Test Plan: Existing tests. Make sure perf doesnt regress Differential Revision: [D67914054](https://our.internmc.facebook.com/intern/diff/D67914054) [ghstack-poisoned]
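For context, here is a minimal, hypothetical sketch of what the decoupled interface described in step 2 could look like. The module names (`SimpleKVCache`, `SimpleSDPA`) and their exact signatures are illustrative assumptions, not the actual ExecuTorch definitions; the only detail taken from the commit message is that both modules operate on q, k, v tensors laid out as [B, n_heads, seq_len, head_dim].

```python
import torch
from torch import nn

# Hypothetical, simplified modules illustrating the decoupled interface.
# Names and signatures are assumptions; only the [B, n_heads, seq_len, head_dim]
# layout comes from the commit message.

class SimpleKVCache(nn.Module):
    def __init__(self, max_batch_size: int, n_heads: int, max_seq_len: int, head_dim: int):
        super().__init__()
        cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
        self.register_buffer("k_cache", torch.zeros(cache_shape))
        self.register_buffer("v_cache", torch.zeros(cache_shape))

    def update(self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor):
        # k_val / v_val: [B, n_heads, seq_len, head_dim]; input_pos holds the
        # positions being written, e.g. torch.arange(start, start + seq_len).
        self.k_cache[:, :, input_pos] = k_val
        self.v_cache[:, :, input_pos] = v_val
        return self.k_cache, self.v_cache


class SimpleSDPA(nn.Module):
    # SDPA no longer owns the cache; it only consumes q, k, v.
    def forward(self, q, k, v, mask=None):
        # q, k, v: [B, n_heads, seq_len, head_dim]
        return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
```

An attention block composed this way would call `cache.update(...)` and then the SDPA module separately, which is what makes either piece individually swappable for a custom-op-backed implementation.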
2 parents 84ef14b + f6a87ee commit 6e8cff5

File tree: 6 files changed (+9, -7 lines changed)

examples/models/llama/llama_transformer.py

Lines changed: 0 additions & 2 deletions
@@ -309,8 +309,6 @@ def forward(
         seqlen,
         mask: torch.Tensor,
     ) -> torch.Tensor:
-        # TODO(kimishpatel): Move this slicing logic to Attention block so that
-        # SDPA does not have to take input_pos as arg
         if self.enable_dynamic_shape:
             start_pos = input_pos[-1].item()
             torch._check_is_size(start_pos)

examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ def __init__(
         )
 
         # For now supporting int8 only
-        self.use_custom_update_cache_op = True
+        self.use_custom_update_cache_op = use_custom_update_cache_op
         self.quantized_cache_dtype = torch.int8
         self.cache_fp_type = torch.float32
         cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)

examples/models/llama/tests/test_simple_sdpa.py

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import copy
 import unittest
 
 import torch

extension/llm/export/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ runtime.python_library(
     name = "export_lib",
     srcs = [
         "builder.py",
+        "export_passes.py",
         "partitioner_lib.py",
         "quantizer_lib.py",
     ],

extension/llm/export/builder.py

Lines changed: 6 additions & 2 deletions
@@ -34,7 +34,7 @@
 
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
 
-from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.export_passes import RemoveRedundantPermutes
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torch.ao.quantization.quantizer import Quantizer
@@ -113,7 +113,7 @@ def __init__(
         self.calibration_seq_length = calibration_seq_length
         self.calibration_data = calibration_data
         self.tokenizer_path = tokenizer_path
-        self.canonical_passes = [RemoveRedundantTransposes()]
+        self.canonical_passes = [RemoveRedundantPermutes()]
 
     def set_output_dir(self, output_dir: str) -> "LLMEdgeManager":
         """
@@ -227,6 +227,10 @@ def export(self) -> "LLMEdgeManager":
         return self
 
     def run_canonical_optimizations(self):
+        """
+        Run canonical optimizations (at the moment removing redundant permutes) on the model.
+        """
+        assert self.pre_autograd_graph_module is not None, "Please run export() first"
         for pass_instance in self.canonical_passes:
             logging.info(f"Running canonical pass: {pass_instance.__class__.__name__}")
             res = pass_instance(self.pre_autograd_graph_module)
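The added assert encodes an ordering contract on the manager: `export()` must populate `pre_autograd_graph_module` before the canonical passes can run. A rough, hypothetical usage sketch follows; constructor arguments are elided because they are not part of this diff.

```python
# Hypothetical usage sketch; LLMEdgeManager constructor arguments are elided
# here because they are not shown in this diff.
manager = LLMEdgeManager(...)
manager.export()                       # populates pre_autograd_graph_module
manager.run_canonical_optimizations()  # runs RemoveRedundantPermutes on the exported graph
```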

extension/llm/export/export_passes.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def _normalize_dims(tensor: FakeTensor, dim_0: int, dim_1: int):
     return dim_0, dim_1
 
 
-class RemoveRedundantTransposes(ExportPass):
+class RemoveRedundantPermutes(ExportPass):
     """
     This pass removes redundant transpose nodes in the graph.
     It checks if the next node is also a transpose node and if the two transpose nodes undo each other.
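The diff only shows the renamed class and its docstring, not the pass body. As a rough illustration of the idea the docstring describes (cancelling a transpose whose only consumer is another transpose over the same pair of dims), here is a simplified, standalone torch.fx sketch. It is not the actual `RemoveRedundantPermutes` implementation, which subclasses `ExportPass` and normalizes negative dims via `_normalize_dims`; this version assumes a symbolically traced graph where transposes show up as `call_method` nodes with non-negative, matching dims.

```python
import torch.fx as fx


def remove_redundant_transpose_pairs(gm: fx.GraphModule) -> fx.GraphModule:
    """Simplified sketch: fold x.transpose(a, b).transpose(a, b) back to x."""
    for node in list(gm.graph.nodes):
        if node.op == "call_method" and node.target == "transpose":
            parent, *dims = node.args
            if (
                isinstance(parent, fx.Node)
                and parent.op == "call_method"
                and parent.target == "transpose"
                and len(parent.users) == 1                    # only this node consumes it
                and sorted(dims) == sorted(parent.args[1:])   # the two swaps cancel out
            ):
                # Rewire consumers of the second transpose to the original tensor,
                # then drop both transpose nodes.
                node.replace_all_uses_with(parent.args[0])
                gm.graph.erase_node(node)
                gm.graph.erase_node(parent)
    gm.graph.lint()
    gm.recompile()
    return gm
```

For a traced model, this could be applied as `remove_redundant_transpose_pairs(fx.symbolic_trace(model))`; the real pass instead runs over the pre-autograd export graph via the manager's canonical passes.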
