
Commit 275144b

Update on " [ExecuTorch][BE] Split kv cache and SDPA for better code sharing"
Summary: Why? We have coupled SDPA with kv cache for a while. Initially this was done as we implemented sdpa_with_kv_cache custom op to reduce multiple copy overheads from kv cache update. (This could have been done by having separate custom kv cache update and custom sdpa op. Recent changes enabled this.) As a result of SDPA module owning kv cache, we get a) non-composable implementation and b) harder to reuse model definition and components from repos like tune. Output of this is that we have multiple definition of the same model, llama, lying around in ET, TorchChat and Tune. This diff and subsequent ones will try to move in the direction where custom kv cache and custom sdpa become decoupled and composable, making it more module-swap friendly with tune's model definition. How. Earlier PRs decoupled kv cache update from sdpa. So now 1. Decouple SDPA nn.Module from KV cache. 2. Standardize on KVCache and SDPA interface. That is KVCache and SDPA both operate on q, k, v in [B, # heads, seq_len, head_dim] formatted tensors. 3. 2 will introduce multiple tranposes when KVCache and SDPA are replaced by custom modules, but we will write graph pass to undo those. Test Plan: Existing tests. Make sure perf doesnt regress [ghstack-poisoned]
2 parents 5eb4c6f + e105c4c commit 275144b
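To make the intended interface concrete, here is a minimal sketch (not the actual ExecuTorch modules; SimpleKVCache, SimpleSDPA, and their exact signatures are invented for illustration) of a KV cache and an SDPA module that both operate on [B, n_heads, seq_len, head_dim] tensors, so either can be swapped for a custom implementation independently of the other:

import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleKVCache(nn.Module):
    # Hypothetical cache: k/v buffers laid out as [B, n_heads, max_seq_len, head_dim].
    def __init__(self, batch_size, n_heads, max_seq_len, head_dim):
        super().__init__()
        self.register_buffer("k_cache", torch.zeros(batch_size, n_heads, max_seq_len, head_dim))
        self.register_buffer("v_cache", torch.zeros(batch_size, n_heads, max_seq_len, head_dim))

    def update(self, start_pos, k, v):
        # k, v: [B, n_heads, seq_len, head_dim]
        seq_len = k.shape[2]
        self.k_cache[:, :, start_pos : start_pos + seq_len] = k
        self.v_cache[:, :, start_pos : start_pos + seq_len] = v
        return self.k_cache, self.v_cache


class SimpleSDPA(nn.Module):
    # Hypothetical SDPA wrapper: consumes the same [B, n_heads, seq_len, head_dim] layout.
    def forward(self, q, k, v, mask=None):
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

If a custom replacement for either module internally prefers the [B, seq_len, n_heads, head_dim] layout, it will transpose its inputs and outputs, which is how back-to-back transpose pairs end up in the exported graph; the RemoveRedundantTransposes pass added in this stack cleans those up.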

File tree: 4 files changed, +259 -0 lines


examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 0 deletions
@@ -657,6 +657,8 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
     # export_to_edge
     builder_exported = _prepare_for_llama_export(args).export()
 
+    builder_exported.run_canonical_optimizations()
+
     if args.export_only:
         exit()

extension/llm/export/builder.py

Lines changed: 10 additions & 0 deletions
@@ -37,6 +37,8 @@
 from torch.export import export_for_training
 from torch.nn.attention import SDPBackend
 
+from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)

@@ -108,6 +110,7 @@ def __init__(
         self.calibration_seq_length = calibration_seq_length
         self.calibration_data = calibration_data
         self.tokenizer_path = tokenizer_path
+        self.canonical_passes = [RemoveRedundantTransposes()]
 
     def set_output_dir(self, output_dir: str) -> "LLMEdgeManager":
         """
@@ -212,6 +215,13 @@ def export(self) -> "LLMEdgeManager":
 
         return self
 
+    def run_canonical_optimizations(self):
+        for pass_instance in self.canonical_passes:
+            logging.info(f"Running canonical pass: {pass_instance.__class__.__name__}")
+            res = pass_instance(self.pre_autograd_graph_module)
+            assert res.graph_module is not None, "Pass returned None"
+            self.pre_autograd_graph_module = res.graph_module
+
     def pt2e_calibrate(
         self,
         prepared_module,
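For context, canonical_passes is a list of pass objects that accept a graph module and return a torch.fx PassResult, which is the contract run_canonical_optimizations relies on. A minimal sketch of that contract, using a hypothetical do-nothing pass (NoOpPass is not part of the codebase):

import torch

from executorch.exir.pass_base import ExportPass
from torch.fx.passes.infra.pass_base import PassResult


class NoOpPass(ExportPass):
    # Hypothetical pass illustrating the interface expected by canonical_passes.
    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        # A real pass would rewrite graph_module.graph here and report whether
        # anything changed; this sketch leaves the graph untouched.
        return PassResult(graph_module, False)

run_canonical_optimizations would apply such a pass the same way it applies RemoveRedundantTransposes: call it on self.pre_autograd_graph_module and keep the returned graph_module.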
extension/llm/export/export_passes.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
import torch
from torch._subclasses import FakeTensor

from executorch.exir.pass_base import ExportPass
from torch.fx.passes.infra.pass_base import PassResult


def _normalize_dims(tensor: FakeTensor, dim_0: int, dim_1: int):
    """
    Normalize (possibly negative) dimension indices into the range [0, ndim).
    """
    assert tensor is not None, "Tensor is None"
    ndim = tensor.ndim
    if dim_0 < 0:
        dim_0 = ndim + dim_0
    if dim_1 < 0:
        dim_1 = ndim + dim_1
    assert dim_0 < ndim and dim_1 < ndim, f"Invalid dimensions: {dim_0}, {dim_1}"
    return dim_0, dim_1


class RemoveRedundantTransposes(ExportPass):
    """
    This pass removes redundant transpose nodes in the graph.
    It checks whether a user of a transpose node is also a transpose node and whether
    the two transposes undo each other. For example, if the graph has the following nodes:

    node1 = torch.ops.aten.transpose.int(x, 0, 1)
    node2 = torch.ops.aten.transpose.int(node1, 0, 1)

    then node2's uses can be replaced by x.

    It also checks for permute nodes:

    node1 = torch.ops.aten.permute(x, [0, 2, 1])
    node2 = torch.ops.aten.permute(node1, [0, 2, 1])

    Then node2's uses can likewise be replaced by x.

    NB: Does not work for in-place ops or functionalized _copy suffix ops.
    """

    def call(self, graph_module: torch.fx.GraphModule):
        graph_changed = False
        for node in graph_module.graph.nodes:
            if node.op == 'call_function' and node.target == torch.ops.aten.transpose.int:
                # Check if any user is also a transpose node
                transpose_users = list(node.users.keys())
                dim_0 = node.args[1]
                dim_1 = node.args[2]
                dim_0, dim_1 = _normalize_dims(node.args[0].meta["val"], dim_0, dim_1)

                for user in transpose_users:
                    if user.op == 'call_function' and user.target == torch.ops.aten.transpose.int:
                        # Get the arguments of the current and next transpose nodes
                        user_dim_0 = user.args[1]
                        user_dim_1 = user.args[2]
                        user_dim_0, user_dim_1 = _normalize_dims(
                            user.args[0].meta["val"], user_dim_0, user_dim_1
                        )

                        # Check if the two transpose nodes undo each other
                        if dim_0 == user_dim_0 and dim_1 == user_dim_1:
                            graph_changed = True
                            user.replace_all_uses_with(node.args[0])

        for node in graph_module.graph.nodes:
            if node.op == 'call_function' and node.target == torch.ops.aten.permute.default:
                # Check if any user is also a permute node
                permute_users = list(node.users.keys())
                dim_list = node.args[1]

                for user in permute_users:
                    if user.op == 'call_function' and user.target == torch.ops.aten.permute.default:
                        # Get the argument of the current and next permute nodes
                        user_dim_list = user.args[1]

                        # Check if the two permutes undo each other
                        if dim_list == user_dim_list:
                            graph_changed = True
                            user.replace_all_uses_with(node.args[0])

        graph_module.graph.eliminate_dead_code()
        graph_module.recompile()

        return PassResult(graph_module, graph_changed)
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
import unittest
import os

import torch
from torch.testing import FileCheck

from torch.export import export_for_training

from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes


class RemoveRedundantTransposesPassTest(unittest.TestCase):
    def _export(self, model, example_inputs):
        exported_module = export_for_training(
            model,
            example_inputs,
        )
        return exported_module.module()

    def _check(self, model, example_inputs, key, before_count, after_count):
        gm = self._export(model, example_inputs)
        FileCheck().check_count(key, before_count, exactly=True).run(gm.code)
        pass_res = RemoveRedundantTransposes()(gm)
        FileCheck().check_count(key, after_count, exactly=True).run(
            pass_res.graph_module.code
        )

    def test_transpose_removal(self):
        class TestModule1(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.transpose(x, 1, 2)
                x = torch.transpose(x, 1, 2)
                return x + 1

        class TestModule2(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.transpose(x, 1, 2)
                x = torch.transpose(x, 1, 2)
                x = x + 1

                x = torch.transpose(x, 2, 3)
                x = torch.transpose(x, 2, 3)

                return x + 2

        x = torch.rand((1, 2, 3, 4))
        key = "torch.ops.aten.transpose.int"
        m = TestModule1()
        self._check(m, (x,), key, 2, 0)

        m = TestModule2()
        self._check(m, (x,), key, 4, 0)

    def test_transpose_no_removal(self):
        class TestModule1(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.transpose(x, 1, 2)
                x = torch.transpose(x, 1, 2)
                x = x + 1

                x = torch.transpose(x, 2, 3)
                x = torch.transpose(x, 1, 2)

                return x + 2

        x = torch.rand((1, 2, 3, 4))
        key = "torch.ops.aten.transpose.int"

        m = TestModule1()
        self._check(m, (x,), key, 4, 2)

        class TestModule2(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x_1 = torch.transpose(x, 1, 2)
                x_2 = torch.transpose(x_1, 1, 2)
                x_2 = x_2 + 1

                x = x_1 + 2
                x = torch.transpose(x, 1, 2)

                return x + x_2

        m = TestModule2()
        self._check(m, (x,), key, 3, 2)

    def test_permute_removal(self):
        class TestModule1(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.permute(x, [0, 2, 1, 3])
                x = torch.permute(x, [0, 2, 1, 3])
                return x + 1

        class TestModule2(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.permute(x, [0, 2, 1, 3])
                x = torch.permute(x, [0, 2, 1, 3])
                x = x + 1

                x = torch.permute(x, [0, 1, 3, 2])
                x = torch.permute(x, [0, 1, 3, 2])

                return x + 2

        x = torch.rand((1, 2, 3, 4))
        key = "torch.ops.aten.permute.default"
        m = TestModule1()
        self._check(m, (x,), key, 2, 0)

        m = TestModule2()
        self._check(m, (x,), key, 4, 0)

    def test_permute_no_removal(self):
        class TestModule1(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x = torch.permute(x, [0, 2, 1, 3])
                x = torch.permute(x, [0, 2, 1, 3])
                x = x + 1

                x = torch.permute(x, [0, 1, 3, 2])
                x = torch.permute(x, [0, 2, 1, 3])

                return x + 2

        x = torch.rand((1, 2, 3, 4))
        key = "torch.ops.aten.permute.default"

        m = TestModule1()
        self._check(m, (x,), key, 4, 2)

        class TestModule2(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, x):
                x_1 = torch.permute(x, [0, 2, 1, 3])
                x_2 = torch.permute(x_1, [0, 2, 1, 3])
                x_2 = x_2 + 1

                x = x_1 + 2
                x = torch.permute(x, [0, 2, 1, 3])

                return x + x_2

        m = TestModule2()
        self._check(m, (x,), key, 3, 2)
