
Commit d266e75

sxu authored and facebook-github-bot committed

LLM export pass to swap in custom SDPA

Differential Revision: D73444078

1 parent 9e64882 commit d266e75

3 files changed: 108 additions & 1 deletion

extension/llm/export/TARGETS

Lines changed: 12 additions & 0 deletions

@@ -41,6 +41,18 @@ runtime.python_library(
         "//executorch/exir:lib",
         "//executorch/exir/backend:backend_details",
         "//executorch/extension/export_util:export_util",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
         "//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
     ],
 )
+
+runtime.python_test(
+    name = "export_passes_test",
+    srcs = [
+        "test_export_passes.py",
+    ],
+    deps = [
+        ":export_lib",
+    ],
+)
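The new runtime.python_test target registers the pass's unit tests with the build. Assuming the //executorch cell seen in the deps above (the exact target path is an assumption, not stated in this diff), they could be run with something like:

buck2 test //executorch/extension/llm/export:export_passes_test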

extension/llm/export/export_passes.py

Lines changed: 58 additions & 0 deletions

@@ -4,6 +4,9 @@
 from torch._subclasses import FakeTensor
 from torch.fx.passes.infra.pass_base import PassResult
 
+torch.ops.load_library("//executorch/extension/llm/custom_ops:custom_ops_aot_lib")
+from executorch.extension.llm.custom_ops import custom_ops  # noqa
+
 
 def _normalize_dims(tensor: FakeTensor, dim_0: int, dim_1: int):
     """
@@ -95,3 +98,58 @@ def call(self, graph_module: torch.fx.GraphModule):
         graph_module.recompile()
 
         return PassResult(graph_module, graph_changed)
+
+
+class ReplaceSDPAWithCustomSDPAPass(ExportPass):
+    def call_operator(self, op, args, kwargs, meta):  # pyre-ignore
+        if op != torch.ops.aten.scaled_dot_product_attention.default:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # aten SDPA is (q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False,
+        # *, scale=None); unpack the optional positional args.
+        q, k, v, *rest = args
+        mask = None
+        dropout = 0.0
+        is_causal = False
+        scale = None
+        if len(rest) > 0:
+            mask = rest[0]
+        if len(rest) > 1:
+            dropout = rest[1]
+        if len(rest) > 2:
+            is_causal = rest[2]
+        if "scale" in kwargs:
+            scale = kwargs["scale"]
+
+        # custom_sdpa takes (batch, seq_len, heads, head_dim) inputs, so swap
+        # dims 1 and 2 on the way in and again on the way out.
+        qT = self._transpose(q, meta)
+        kT = self._transpose(k, meta)
+        vT = self._transpose(v, meta)
+
+        # Convert a boolean mask into an additive float mask:
+        # True -> 0.0 (attend), False -> -inf (masked out).
+        if mask is not None and mask.node.meta["val"].dtype == torch.bool:
+            mask = super().call_operator(
+                torch.ops.aten.where.Scalar,
+                (mask, 0.0, float("-inf")),
+                {},
+                meta,
+            )
+
+        custom_sdpa = super().call_operator(
+            torch.ops.llama.custom_sdpa.default,
+            (qT, kT, vT, 0, mask, dropout, is_causal, scale),  # 0 is start_pos
+            {},
+            meta,
+        )
+        return self._transpose(custom_sdpa, meta)
+
+    def _transpose(self, x, meta):  # pyre-ignore
+        transpose = super().call_operator(
+            torch.ops.aten.transpose.int,
+            (x, 1, 2),
+            {},
+            meta,
+        )
+        contiguous = super().call_operator(
+            torch.ops.aten.contiguous.default,
+            (transpose,),
+            {},
+            meta,
+        )
+        return contiguous
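The boolean-to-float mask conversion above preserves SDPA semantics: aten's SDPA treats a bool mask as "True = attend", while a float mask is added to the attention logits, so 0.0 keeps a position and -inf drops it. A minimal eager-mode sketch (plain PyTorch, no ExecuTorch dependency) of the equivalence the pass relies on:

import torch
import torch.nn.functional as F

# Boolean mask: True means the position may be attended to.
bool_mask = torch.tril(torch.ones(4, 4, dtype=torch.bool))
# Additive float mask: keep -> 0.0, drop -> -inf, mirroring aten.where.Scalar above.
float_mask = torch.where(bool_mask, 0.0, float("-inf"))

q = k = v = torch.rand(1, 2, 4, 8)  # (batch, heads, seq_len, head_dim)
y_bool = F.scaled_dot_product_attention(q, k, v, attn_mask=bool_mask)
y_float = F.scaled_dot_product_attention(q, k, v, attn_mask=float_mask)
assert torch.allclose(y_bool, y_float)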

extension/llm/export/test_export_passes.py

Lines changed: 38 additions & 1 deletion

@@ -2,7 +2,10 @@
 
 import torch
 
-from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.export_passes import (
+    RemoveRedundantTransposes,
+    ReplaceSDPAWithCustomSDPAPass,
+)
 
 from torch.export import export_for_training
 from torch.testing import FileCheck
@@ -160,3 +163,37 @@ def forward(self, x):
 
         m = TestModule2()
         self._check(m, (x,), key, 3, 2)
+
+
+class ReplaceSDPAWithCustomSDPAPassTest(unittest.TestCase):
+    class TestModule(torch.nn.Module):
+        def forward(self, x, mask, is_causal):
+            return torch.nn.functional.scaled_dot_product_attention(
+                x, x, x, attn_mask=mask, is_causal=is_causal
+            )
+
+    def setUp(self):
+        torch.manual_seed(0)
+
+    def _test(self, *args):
+        m = self.TestModule()
+        gm = export_for_training(m, args, strict=True).module()
+
+        # The pass must remove the lone aten SDPA node, insert exactly one
+        # custom_sdpa call, and leave the numerics unchanged.
+        sdpa_key = "torch.ops.aten.scaled_dot_product_attention.default"
+        custom_sdpa_key = "torch.ops.llama.custom_sdpa.default"
+        FileCheck().check_count(sdpa_key, 1, exactly=True).run(gm.code)
+        gm = ReplaceSDPAWithCustomSDPAPass()(gm).graph_module
+        FileCheck().check_count(sdpa_key, 0, exactly=True).run(gm.code)
+        FileCheck().check_count(custom_sdpa_key, 1, exactly=True).run(gm.code)
+
+        y1 = m(*args)
+        y2 = gm(*args)
+        self.assertTrue(torch.allclose(y1, y2))
+
+    def test_causal_mask(self):
+        self._test(torch.rand(1, 4, 32, 64), None, True)
+
+    def test_custom_mask(self):
+        # Banded boolean mask: XOR of two lower-triangular masks.
+        m1 = torch.tril(torch.ones(32, 32, dtype=torch.bool))
+        m2 = torch.tril(torch.ones(32, 32, dtype=torch.bool), diagonal=-16)
+        self._test(torch.rand(1, 4, 32, 64), torch.logical_xor(m1, m2), False)
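For reference, a minimal end-to-end usage sketch mirroring the test above (the Attention module name and shapes are illustrative; shapes follow the test's (batch, heads, seq_len, head_dim) convention):

import torch
from torch.export import export_for_training

from executorch.extension.llm.export.export_passes import ReplaceSDPAWithCustomSDPAPass


class Attention(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)


args = (torch.rand(1, 4, 32, 64),) * 3
gm = export_for_training(Attention(), args, strict=True).module()
# Swap the aten SDPA node for the custom op; the pass returns a PassResult.
gm = ReplaceSDPAWithCustomSDPAPass()(gm).graph_module
print(gm.code)  # now calls torch.ops.llama.custom_sdpa.default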
