Arm backend: Add support for grouped convolution (#11817)

AdrianLundell · web-flow · commit 28b81983d860 · 2025-06-19T16:12:17.000+02:00
Grouped convolution is lowered as separate convolutions on different
slices of the input and weights in a new DecomposeGroupedConv pass.
Tested in two new tests in test_conv2d.

Fuse constant ops pass is additionally updated to make sure all removed
placeholders are deleted.

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -24,6 +24,7 @@
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
+from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -27,6 +27,7 @@
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeGeluPass,
+    DecomposeGroupedConv,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
@@ -117,6 +118,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeLinearPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
+        self.add_pass(DecomposeGroupedConv())
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
@@ -174,6 +176,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(MatchArgRanksPass(exported_program))
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
+        self.add_pass(DecomposeGroupedConv())
         self.add_pass(RemoveClonePass())
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
diff --git a/backends/arm/_passes/decompose_grouped_conv.py b/backends/arm/_passes/decompose_grouped_conv.py
@@ -0,0 +1,134 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from copy import copy
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeGroupedConv(ExportPass):
+    """
+    Splits a grouped convolution which is not supported by TOSA into multiple
+    convolutions using slice->conv->cat.
+
+    Convolutions with groups == 1, transposed convolutions and depthwise
+    convolutions (input channels == groups) are left untouched.
+
+    Before pass:
+        x = conv(input, weight, bias, groups = 2)
+
+    After pass:
+        input1 = slice(input)
+        weight1 = slice(weight)
+        bias1 = slice(bias)
+        x1 = conv(input1, weight1, bias1)
+
+        input2 = slice(input)
+        weight2 = slice(weight)
+        bias2 = slice(bias)
+        x2 = conv(input2, weight2, bias2)
+
+        x = cat(x1, x2)
+    """
+
+    @staticmethod
+    def _get_decomposition(op):
+        """Return the (slice, conv, cat) ops matching the dialect of `op`.
+
+        Raises RuntimeError if `op` is not a supported convolution op.
+        """
+        match op:
+            case exir_ops.edge.aten.convolution.default:
+                return (
+                    exir_ops.edge.aten.slice_copy.Tensor,
+                    exir_ops.edge.aten.convolution.default,
+                    exir_ops.edge.aten.cat.default,
+                )
+            case torch.ops.aten.conv2d.default:
+                return (
+                    torch.ops.aten.slice_copy.Tensor,
+                    torch.ops.aten.conv2d.default,
+                    torch.ops.aten.cat.default,
+                )
+            case _:
+                raise RuntimeError("Unvalid op for grouped conv decomposition.")
+
+    def _slice_groups(self, slice_op, node, dim, size, groups, kwargs, meta):
+        """
+        Slice `node` into `groups` chunks of `size` along `dim`. A `node` of
+        None (convolution without bias) gives a list of `groups` Nones.
+        """
+        if node is None:
+            return [None] * groups
+        slices = []
+        for i in range(groups):
+            slice_args = (node, dim, i * size, (i + 1) * size)
+            slices.append(super().call_operator(slice_op, slice_args, kwargs, meta))
+        return slices
+
+    def call_operator(self, op, args, kwargs, meta):
+        """Decompose grouped convolutions; forward all other ops unchanged."""
+        if op == exir_ops.edge.aten.convolution.default:
+            groups = args[8]
+            transposed = args[6]
+            # stride, padding, dilation, transposed, output_padding
+            conv_params = args[3:8]
+        elif op == torch.ops.aten.conv2d.default:
+            # Trailing arguments left at their defaults may be omitted from
+            # args, in which case groups has its default value of 1.
+            groups = args[6] if len(args) > 6 else 1
+            transposed = False
+            # stride, padding, dilation
+            conv_params = args[3:6]
+        else:
+            return super().call_operator(op, args, kwargs, meta)
+
+        if groups == 1 or transposed:
+            return super().call_operator(op, args, kwargs, meta)
+
+        input_node = args[0]
+        if input_node.data.shape[1] == groups:
+            # This is a depthwise convolution which is handled elsewhere
+            return super().call_operator(op, args, kwargs, meta)
+
+        weight_node = args[1]
+        bias_node = args[2]
+
+        # The weight is laid out as (out_channels, in_channels // groups, ...):
+        # dim 1 is the input width of one group, dim 0 spans all groups' outputs.
+        input_slice_size = weight_node.data.shape[1]
+        output_slice_size = weight_node.data.shape[0] // groups
+
+        # Slices and the final cat use a copy of meta with its data cleared
+        # (presumably to drop the q/dq annotations of the original conv).
+        no_q_dq_meta = copy(meta)
+        no_q_dq_meta.data = {}
+
+        slice_op, conv_op, cat_op = DecomposeGroupedConv._get_decomposition(op)
+
+        input_slices = self._slice_groups(
+            slice_op, input_node, 1, input_slice_size, groups, kwargs, no_q_dq_meta
+        )
+        filter_slices = self._slice_groups(
+            slice_op, weight_node, 0, output_slice_size, groups, kwargs, no_q_dq_meta
+        )
+        bias_slices = self._slice_groups(
+            slice_op, bias_node, 0, output_slice_size, groups, kwargs, no_q_dq_meta
+        )
+
+        output_slices = []
+        for input_slice, filter_slice, bias_slice in zip(
+            input_slices, filter_slices, bias_slices
+        ):
+            # Every per-group convolution is an ordinary one, i.e. groups == 1.
+            conv_args = (input_slice, filter_slice, bias_slice, *conv_params, 1)
+            output_slices.append(
+                super().call_operator(conv_op, conv_args, kwargs, meta)
+            )
+
+        cat_args = (output_slices, 1)
+        return super().call_operator(cat_op, cat_args, kwargs, no_q_dq_meta)
diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py
@@ -98,7 +98,7 @@ def _fuse_nodes(self, node) -> bool:
 
     def call(self, graph_module):
         modified = False
-        input_nodes_to_delete = []
+        input_nodes_to_maybe_delete = set()
         for node in graph_module.graph.nodes:
             if node.op != "call_function":
                 continue
@@ -128,22 +128,17 @@ def call(self, graph_module):
                     )
                     modified |= did_fuse
                     graph_module.recompile()  # Recompile needed to catch chains of constant ops
-                    input_nodes_to_delete.extend(
-                        [
-                            input_node
-                            for input_node in input_nodes
-                            if len(input_node.users) == 1
-                        ]
-                    )
+                    input_nodes_to_maybe_delete.update(input_nodes)
             except Exception as e:
                 logger.warning(
                     f"\nFailed to fuse constant op {node.name} due to exception:\n{str(e)}"
                 )
 
         if modified:
             graph_module.graph.eliminate_dead_code()
-            for input_node in input_nodes_to_delete:
-                delete_constant_placeholder(self.exported_program, input_node)
+            for input_node in input_nodes_to_maybe_delete:
+                if len(input_node.users) == 0:
+                    delete_constant_placeholder(self.exported_program, input_node)
 
             graph_module = super().call(graph_module).graph_module
 
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
@@ -327,6 +327,34 @@ def forward(self, x):
     batches=1,
 )
 
+conv2d_groups = Conv2d(
+    in_channels=12,
+    out_channels=9,
+    kernel_size=(3, 3),
+    stride=1,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+    groups=3,  # 12 / 3 = 4 input and 9 / 3 = 3 output channels per group
+    bias=False,
+)
+
+conv2d_groups_bias = Conv2d(
+    in_channels=15,
+    out_channels=5,
+    kernel_size=(3, 3),
+    stride=1,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+    groups=5,  # 15 / 5 = 3 input and 5 / 5 = 1 output channel per group
+    bias=True,
+)
+
 # Shenanigan to get a nicer output when test fails. With unittest it looks like:
 # FAIL: test_convolution_2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1
 test_modules = {
@@ -348,6 +376,8 @@ def forward(self, x):
     "3x3_1x3x224x224_st2_pd1": lambda: conv2d_3x3_1x3x224x224_st2_pd1,
     "two_conv2d_nobias": lambda: two_conv2d_nobias,
     "two_conv2d": lambda: two_conv2d,
+    "groups": lambda: conv2d_groups,
+    "groups_bias": lambda: conv2d_groups_bias,
 }
 
 fvp_xfails = {