Commit a70d070 (parent: 593da70)

add fp32 bmm op

Differential Revision: D60153721
Pull Request resolved: #4604

9 files changed: +153 -0
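For context, the op being lowered, torch.bmm, is a batched matrix multiply with no broadcasting: both inputs must be 3-D, with equal batch sizes and matching inner dimensions. A quick shape sketch:

import torch

x = torch.randn(2, 3, 4)  # (batch, n, m)
y = torch.randn(2, 4, 6)  # (batch, m, p)
out = torch.bmm(x, y)     # (batch, n, p)
assert out.shape == (2, 3, 6)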

backends/xnnpack/operators/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
     op_add,
     op_addmm,
     op_avg_pooling2d,
+    op_bmm,
     op_cat,
     op_ceiling,
     op_clamp,
backends/xnnpack/operators/op_bmm.py (new file)

Lines changed: 54 additions & 0 deletions

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict

import torch
from executorch.backends.xnnpack.operators.node_visitor import (
    NodeVisitor,
    register_node_visitor,
)
from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
    XNNBatchMatrixMultiply,
    XNNGraph,
    XNode,
)
from executorch.backends.xnnpack.utils.utils import get_input_node


@register_node_visitor
class BMMVisitor(NodeVisitor):
    target = "aten.bmm.default"

    def __init__(self, *args) -> None:
        super().__init__(*args)

    def define_node(
        self,
        node: torch.fx.Node,
        xnn_graph: XNNGraph,
        vals_to_ids: Dict[torch.fx.Node, int],
        debug_handle: int,
    ) -> None:
        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)

        # input1
        input1_id = vals_to_ids[get_input_node(node, 0)]

        # input2
        input2_id = vals_to_ids[get_input_node(node, 1)]

        # output
        output_id = vals_to_ids[node]

        ser_node = XNode(
            xnode_union=XNNBatchMatrixMultiply(
                input1_id=input1_id, input2_id=input2_id, output_id=output_id, flags=0
            ),
            debug_handle=debug_handle,
        )
        xnn_graph.xnodes.append(ser_node)
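For readers unfamiliar with the visitor plumbing: get_input_node(node, i) is, to a first approximation, a lookup of the i-th FX argument, so the two ids fetched above correspond to the bmm operands in order. A rough equivalent of the helper (an assumption for illustration, not its actual source):

def get_input_node(node: torch.fx.Node, input_index: int) -> torch.fx.Node:
    # hypothetical simplification: the i-th positional arg of the FX node
    return node.args[input_index]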

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,7 @@
     AbsConfig,
     AddConfig,
     AvgPoolingConfig,
+    BMMConfig,
     CatConfig,
     CeilConfig,
     ClampConfig,
@@ -60,6 +61,7 @@
     AddmmConfig,
     AvgPoolingConfig,
     BatchNormConfig,
+    BMMConfig,
     CatConfig,
     CeilConfig,
     ConstantPadConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 12 additions & 0 deletions

@@ -403,3 +403,15 @@ class SubConfig(GenericNodePartitionerConfig):

     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT]
+
+
+class BMMConfig(GenericNodePartitionerConfig):
+    """
+    Despite being a GEMM kernel, BMM can be partitioned like a single-node
+    partitioner because it does not perform any packing on the inputs being
+    matrix multiplied.
+    """
+
+    target_name = "bmm.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
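With BMMConfig registered, a module containing only torch.bmm can be lowered end to end. A minimal sketch of that flow outside the test harness, assuming the standard ExecuTorch export entry points (the module and shapes here are illustrative):

import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class BMM(torch.nn.Module):
    def forward(self, x, y):
        return torch.bmm(x, y)


exported = torch.export.export(BMM(), (torch.randn(2, 3, 4), torch.randn(2, 4, 6)))
# BMMConfig claims fp32 bmm nodes, so the whole graph should be delegated to XNNPACK.
edge = to_edge_transform_and_lower(exported, partitioner=[XnnpackPartitioner()])
program = edge.to_executorch()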

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 30 additions & 0 deletions

@@ -1504,6 +1504,35 @@ Error defineScaledDotProductAttentionNode(

   return Error::Ok;
 }
+
+/*
+Defines a batch matrix multiply node in the subgraph,
+using the remapped ids to map the serialized ids
+to the new ids generated when defining the tensor values.
+*/
+Error defineBatchMatrixMultiplyNode(
+    xnn_subgraph_t subgraph_ptr,
+    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
+    const NodePtr node) noexcept {
+  auto graph_node = node->xnode_union_as_XNNBatchMatrixMultiply();
+
+  xnn_status status = xnn_define_batch_matrix_multiply(
+      subgraph_ptr,
+      remapped_ids.at(graph_node->input1_id()),
+      remapped_ids.at(graph_node->input2_id()),
+      remapped_ids.at(graph_node->output_id()),
+      graph_node->flags());
+
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create BMM node %i with code: %s",
+      node->debug_handle(),
+      xnn_status_to_string(status));
+
+  return Error::Ok;
+}
+
 /*
 Returns a Not Implemented error code. This function is meant to be
 called when the compiler encounters an XNodeType from the flatbuffer
@@ -1566,6 +1595,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(Concatenate4)
     _DEFINE(StaticSlice)
     _DEFINE(ScaledDotProductAttention)
+    _DEFINE(BatchMatrixMultiply)
     case fb_xnnpack::XNodeUnion::NONE:
     default: // Adding here as a catch all, just in case
       return &defineNotImplementedNode;

backends/xnnpack/serialization/runtime_schema.fbs

Lines changed: 1 addition & 0 deletions

@@ -134,6 +134,7 @@ union XNodeUnion {
   XNNConcatenate4: _XNNCat,
   XNNStaticSlice,
   XNNScaledDotProductAttention,
+  XNNBatchMatrixMultiply: _XNNNode2x1,
 }

 union XValueUnion {

backends/xnnpack/serialization/schema.fbs

Lines changed: 1 addition & 0 deletions

@@ -130,6 +130,7 @@ union XNodeUnion {
   XNNConcatenate4: _XNNCat,
   XNNStaticSlice,
   XNNScaledDotProductAttention,
+  XNNBatchMatrixMultiply: _XNNNode2x1,
 }

 union XValueUnion {

backends/xnnpack/serialization/xnnpack_graph_schema.py

Lines changed: 6 additions & 0 deletions

@@ -177,6 +177,11 @@ class XNNConcatenate4(XNNCat):
     pass


+@dataclass
+class XNNBatchMatrixMultiply(XNNNode2x1):
+    pass
+
+
 @dataclass
 class XNNStaticTranspose:
     num_dims: int
@@ -354,6 +359,7 @@ class XNNScaledDotProductAttention:
     XNNConcatenate4,
     XNNStaticSlice,
     XNNScaledDotProductAttention,
+    XNNBatchMatrixMultiply,
 ]
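XNNBatchMatrixMultiply adds no fields of its own; everything comes from XNNNode2x1. Judging from how BMMVisitor constructs it above (two input ids, one output id, and flags=0), the base dataclass presumably looks along these lines (a sketch inferred from the call site, not copied from the schema file):

from dataclasses import dataclass


@dataclass
class XNNNode2x1:
    input1_id: int
    input2_id: int
    output_id: int
    flags: int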
backends/xnnpack/test/ops/bmm.py

Lines changed: 46 additions & 0 deletions (new file)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch
from executorch.backends.xnnpack.test.tester import Tester


class TestBMM(unittest.TestCase):
    class BMM(torch.nn.Module):
        def __init__(self):
            super().__init__()

        def forward(self, x, y):
            return torch.bmm(x, y)

    def _test_bmm(self, inputs):
        (
            Tester(self.BMM(), inputs)
            .export()
            .check_count({"torch.ops.aten.bmm.default": 1})
            .to_edge_transform_and_lower()
            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
            .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"])
            .to_executorch()
            .serialize()
            .run_method_and_compare_outputs()
        )

    def test_fp16_bmm(self):
        inputs = (
            torch.randn(2, 3, 4).to(torch.float16),
            torch.randn(2, 4, 6).to(torch.float16),
        )
        self._test_bmm(inputs)

    def test_fp32_bmm(self):
        inputs = (
            torch.randn(2, 3, 4),
            torch.randn(2, 4, 6),
        )
        self._test_bmm(inputs)
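Assuming the repository root is importable, these tests run under the standard unittest entry point:

python -m unittest backends.xnnpack.test.ops.bmm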
