
Commit 8c3b621

Browse files
mcremon-meta authored and facebook-github-bot committed
Move the transpose matmul pass to OSS and run it earlier in the flow (#10433)
Summary: That pass does a lot more than it looks like, and it's just easier to move it back to where it was. CPU backends will possibly see more cycles due to the added permutes, but we don't care about that here. All DSP backends should be more efficient on transposed matmuls; should that not be the case in the future, we can re-evaluate. When we do the survey of passes and reorder them properly, we can think about this more.

Reviewed By: hsharma35

Differential Revision: D73600069
1 parent 6b877de commit 8c3b621
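
The rewrite is purely a re-expression of the same computation: the second matmul operand is handed to the kernel already transposed, together with a flag saying so, and the kernel is expected to account for that. The exact kernel semantics are backend-specific; the following is only a minimal float-level sketch of the intended behavior, and the transposed_matmul helper is hypothetical, not the Cadence quantized kernel.

import torch

def transposed_matmul(x: torch.Tensor, y_t: torch.Tensor) -> torch.Tensor:
    # Hypothetical reference kernel: the second operand arrives with its last
    # two dims already swapped, and the kernel undoes that internally.
    return torch.matmul(x, y_t.transpose(-1, -2))

x = torch.randn(4, 3)
y = torch.randn(3, 5)

# Rewriting matmul(x, y) as transposed_matmul(x, y^T) changes nothing numerically.
assert torch.allclose(torch.matmul(x, y), transposed_matmul(x, y.transpose(-1, -2)))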

File tree

2 files changed: +145 -1 lines changed


backends/cadence/aot/replace_ops.py

Lines changed: 100 additions & 1 deletion
@@ -32,7 +32,10 @@
     is_quantized_tensor,
     quantize_tensor_multiplier,
 )
-from executorch.backends.cadence.aot.fuse_ops import FuseCascadedViewOps
+from executorch.backends.cadence.aot.fuse_ops import (
+    FuseCascadedTransposeOrPermuteOps,
+    FuseCascadedViewOps,
+)
 from executorch.backends.cadence.aot.pass_utils import (
     CadencePassAttribute,
     register_cadence_pass,
@@ -2290,6 +2293,101 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceMatmulWithTransposedMatmulPass(ExportPass):
+    """
+    For certain backends, we have efficient kernels for transposed matmul. We
+    replace AxB with AxB' for such backends.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op != exir_ops.edge.cadence.quantized_matmul.default or args[-1] is True:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Get the args
+        if len(args) == 9:
+            (
+                X_arg,
+                X_zero_point,
+                Y_arg,
+                Y_zero_point,
+                bias,
+                out_multiplier,
+                out_shift,
+                out_zero_point,
+                transposed,
+            ) = args
+        elif len(args) == 8:
+            (
+                X_arg,
+                X_zero_point,
+                Y_arg,
+                Y_zero_point,
+                bias,
+                out_multiplier,
+                out_shift,
+                out_zero_point,
+            ) = args
+            transposed = False
+        else:
+            raise AssertionError(
+                f"Unexpected number of args for quantized_matmul: {len(args)}"
+            )
+
+        # If the matmul is already transposed, bail
+        if transposed:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Get the second tensor
+        Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg
+        # Concretize the bias
+        zero_bias = super().call_operator(
+            exir_ops.edge.aten.full.default,
+            ([Y_tensor.size(-1)], 0),
+            {"dtype": torch.int32},
+            meta,
+        )
+
+        # If the arg was a ProxyValue, insert a transpose node. Otherwise we
+        # can simply transpose the tensor in place.
+        if isinstance(Y_arg, ProxyValue):
+            transpose_args = (Y_arg, -1, -2)
+            transpose_node = super().call_operator(
+                exir_ops.edge.aten.transpose_copy.int,
+                transpose_args,
+                {},
+                meta,
+            )
+            Y_arg_t = transpose_node
+        else:
+            Y_arg_t = Y_tensor.transpose(-1, -2)
+
+        # Construct the new args, and return the transposed matmul op
+        new_args = (
+            X_arg,
+            X_zero_point,
+            Y_arg_t,
+            Y_zero_point,
+            zero_bias,
+            out_multiplier,
+            out_shift,
+            out_zero_point,
+            True,
+        )
+        return super().call_operator(op, new_args, kwargs, meta)
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        result = super().call(graph_module)
+        # Fuse any inserted transpose node with transpose/permute nodes
+        # surrounding it.
+        result = FuseCascadedTransposeOrPermuteOps()(result.graph_module)
+        assert result is not None
+        # Replace permute with transpose.
+        result = ReplacePermuteWithTransposePass()(result.graph_module)
+        assert result is not None
+        return result
+
+
 # This class encapsulates all the functions that replace/switch one op in the
 # graph with another.
 class CadenceReplaceOpsInGraph:
@@ -2317,6 +2415,7 @@ class CadenceReplaceOpsInGraph:
         # This pass should be after passes that replace conv -> im2row + linear.
         ReplaceIm2RowWithViewPass,
         MakeSliceAndCatDimOutermostPass,
+        ReplaceMatmulWithTransposedMatmulPass,
         ReplaceNopTransposeOrPermuteWithViewPass,
         ReplaceLinearWithFullyConnectedOpPass,
         ReplaceScalarTensorWithFullPass,
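
For reference, here is a plain-Python sketch of the argument rewrite performed by the 8-argument (non-transposed) branch above, with eager tensors standing in for the FX values and a torch.full call standing in for the aten.full node the pass emits; the helper and its names are illustrative only:

import torch

def rewrite_quantized_matmul_args(args):
    # Mirror the 8-arg branch of ReplaceMatmulWithTransposedMatmulPass: unpack,
    # transpose the second operand over its last two dims, swap in a zero bias
    # of length Y.size(-1), and append the transposed=True flag.
    (x, x_zero_point, y, y_zero_point, _bias,
     out_multiplier, out_shift, out_zero_point) = args
    zero_bias = torch.full((y.size(-1),), 0, dtype=torch.int32)
    y_t = y.transpose(-1, -2)
    return (x, x_zero_point, y_t, y_zero_point, zero_bias,
            out_multiplier, out_shift, out_zero_point, True)

The extra transpose_copy this inserts is then cleaned up in call(): if the operand already flows through a transpose or permute, FuseCascadedTransposeOrPermuteOps can fold the two together, which helps keep the permute overhead mentioned in the summary bounded.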

backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 45 additions & 0 deletions
@@ -35,6 +35,7 @@
     ReplaceGeluWithApproximateGeluPass,
     ReplaceIm2RowWithViewPass,
     ReplaceLinearWithFullyConnectedOpPass,
+    ReplaceMatmulWithTransposedMatmulPass,
     ReplaceMMWithAddMMPass,
     ReplaceNopTransposeOrPermuteWithViewPass,
     ReplacePadWithCatPass,
@@ -85,6 +86,50 @@ def assertTargetCountsEqual(
         for target, expected_count in targets_and_counts:
             self.assertTargetCountEqual(graph_module, target, expected_count)
 
+    @parameterized.expand(
+        [
+            # Regular MM
+            [(64, 33), (33, 128)],
+            # Batched MM
+            [(2, 48, 48), (2, 48, 48)],
+        ]
+    )
+    @torch.no_grad()
+    def test_replace_matmul_with_transposed_matmul(
+        self,
+        x_shape: Tuple[int],
+        y_shape: Tuple[int],
+    ) -> None:
+        class MatMul(torch.nn.Module):
+            def __init__(self) -> None:
+                super(MatMul, self).__init__()
+
+            def forward(self, x, y):
+                return torch.matmul(x, y)
+
+        model = MatMul()
+        X = torch.randn(x_shape)
+        Y = torch.randn(y_shape)
+        p = ReplaceMatmulWithTransposedMatmulPass()
+        inputs = (X, Y)
+        quantized_model = quantize_pt2(model, inputs)
+        graph_module = (
+            export_to_edge(quantized_model, inputs).exported_program().graph_module
+        )
+        # pyre-fixme[16]: Optional type has no attribute `graph_module`
+        graph_after_passes = p(graph_module).graph_module
+
+        self.assertEqual(
+            count_node(graph_after_passes, exir_ops.edge.aten.transpose_copy.int),
+            1,
+        )
+        self.assertEqual(
+            count_node(
+                graph_after_passes, exir_ops.edge.cadence.quantized_matmul.default
+            ),
+            1,
+        )
+
     @parameterized.expand(
         [
             [(3, 5), (0, 0)],
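
As a quick sanity check outside the FX machinery, the shapes exercised by the new test behave as expected under the rewrite; a small eager-mode sketch (float math only, not the quantized kernel):

import torch

for x_shape, y_shape in [((64, 33), (33, 128)), ((2, 48, 48), (2, 48, 48))]:
    x, y = torch.randn(x_shape), torch.randn(y_shape)
    y_t = y.transpose(-1, -2)
    # What the rewritten (transposed) matmul is expected to compute in float
    # terms: undo the transpose internally and multiply as before.
    assert torch.allclose(torch.matmul(x, y), torch.matmul(x, y_t.transpose(-1, -2)))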
