
Commit f4fa279

Arm backend: Add cumsum support (#13457)
Decompose cumsum as a convolution with a kernel of ones. Signed-off-by: Adrian Lundell <[email protected]>
1 parent 1c4de12 commit f4fa279
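
The decomposition can be sketched in a few lines of plain PyTorch. The snippet below illustrates the idea only and is not the backend pass itself: cumsum over a 1-D tensor equals a convolution with a kernel of ones applied to an input that is left-padded with len(x) - 1 zeros.

    import torch
    import torch.nn.functional as F

    x = torch.tensor([1.0, 2.0, 3.0, 4.0])
    n = x.numel()
    kernel = torch.ones(1, 1, n)                  # [out_ch, in_ch, kW], all ones
    padded = F.pad(x.view(1, 1, -1), (n - 1, 0))  # left-pad only; the real pass
                                                  # pads symmetrically and slices
    out = F.conv1d(padded, kernel).view(-1)       # tensor([1., 3., 6., 10.])
    assert torch.allclose(out, torch.cumsum(x, dim=0))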

6 files changed, +270 −0 lines changed

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@
 from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass  # noqa
 from .decompose_cosh_pass import DecomposeCoshPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
+from .decompose_cumsum_pass import DecomposeCumsumPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions

@@ -38,6 +38,7 @@
     DecomposeBatchNormNoStatsPass,
     DecomposeCoshPass,
     DecomposeCosineSimilarityPass,
+    DecomposeCumsumPass,
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeExpm1Pass,
@@ -148,6 +149,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
+        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
@@ -227,6 +229,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
+        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
backends/arm/_passes/decompose_cumsum_pass.py (new file)

Lines changed: 142 additions & 0 deletions

@@ -0,0 +1,142 @@
# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from math import prod

import torch
from executorch.backends.arm._passes import ArmPass
from executorch.backends.arm._passes.arm_pass_utils import create_node
from executorch.backends.arm._passes.quant_args import QuantArgs

from executorch.backends.transforms.utils import create_constant_placeholder
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import PassResult
from torch.export.graph_signature import InputKind


class DecomposeCumsumPass(ArmPass):
    """
    Decomposes cumsum into a 1D convolution with a kernel of ones.

    For example, the cumsum of an input tensor [1, 1] is [1, 1 + 1] = [1, 2].
    To decompose this, the input tensor is pre-padded with len(input) - 1 zeros
    and slid over with a kernel of ones of length len(input):

    Input:  [0, 1, 1]
    Kernel: [1, 1]       = [1]
               [1, 1]    = [2]

    Since PyTorch only supports symmetric padding, in practice the result will
    contain additional elements computed at the end, which requires an extra
    slice op.

    To extend this to higher dimensions, the input is reshaped to [N, C, H, W] with

        N = <dims before cumsum dim>
        C = 1
        H = <cumsum dim>
        W = <dims after cumsum dim>

    and the convolution is applied over dimension H.
    """

    def call(self, graph_module):
        graph = graph_module.graph
        targets = (exir_ops.edge.aten.cumsum.default, torch.ops.aten.cumsum.default)
        modified = False
        for node in list(graph.nodes):
            if node.op != "call_function" or node.target not in targets:
                continue

            if len(node.args) != 2:
                raise ValueError(
                    "Cumsum node should have exactly two arguments: input and dim."
                )

            # Get node data
            input_node, dim = node.args
            val = node.meta.get("val")
            original_shape = list(val.shape)
            dtype = input_node.meta.get("val").dtype
            dim = dim % len(original_shape)

            # Compute shapes
            pre_cumsum_dim = prod(original_shape[:dim]) if dim > 0 else 1
            cumsum_dim = original_shape[dim]
            post_cumsum_dim = (
                prod(original_shape[dim + 1 :]) if dim < len(original_shape) - 1 else 1
            )
            conv_shape = [
                pre_cumsum_dim,
                1,
                cumsum_dim,
                post_cumsum_dim,
            ]
            pad_shape = [original_shape[dim] - 1, 0]
            weight_shape = [1, 1, original_shape[dim], 1]

            # Create convolution weight
            with graph.inserting_before(list(graph.nodes)[0]):
                weight_data = torch.ones(size=weight_shape, dtype=dtype)
                weight_node = create_constant_placeholder(
                    self.exported_program,
                    graph,
                    node.name + "_kernel",
                    InputKind.PARAMETER,
                    weight_data,
                )

            # Create decomposed nodes
            view_op = exir_ops.edge.aten.view_copy.default
            conv_op = exir_ops.edge.aten.convolution.default
            slice_op = exir_ops.edge.aten.slice_copy.Tensor
            with graph.inserting_before(node):
                # Reshape to 4D [N, C, H, W]
                view_args = (input_node, conv_shape)
                view_node = create_node(graph, view_op, args=view_args, from_node=node)

                conv_args = (
                    view_node,
                    weight_node,
                    None,
                    [1, 1],
                    pad_shape,
                    [1, 1],
                    False,
                    [0],
                    1,
                )
                conv_node = create_node(graph, conv_op, args=conv_args, from_node=node)

                # The convolution is inserted after quantization, so we need to set
                # our own quantization parameters for the weights here. However,
                # since the data is ones created directly as int8, it already has
                # the correct scale, so no scaling needs to be done, i.e. set
                # scale=1.0, zero_point=0.
                if (
                    "input_qparams" in conv_node.meta
                    and len(conv_node.meta["input_qparams"]) > 0
                ):
                    qparams = QuantArgs(1.0, 0.0, -128, 127, torch.int8)
                    conv_node.meta["input_qparams"][1] = qparams

                slice_args = (conv_node, 2, 0, original_shape[dim])
                slice_node = create_node(
                    graph, slice_op, args=slice_args, from_node=node
                )

                view_original_args = (slice_node, original_shape)
                view_original_node = create_node(
                    graph, view_op, args=view_original_args, from_node=node
                )

            # Replace and remove original
            node.replace_all_uses_with(view_original_node)
            graph.erase_node(node)
            modified = True

        if modified:
            # Cleanup
            graph.eliminate_dead_code()
            graph_module.recompile()
            # Apply any operator-level transforms
            graph_module = super().call(graph_module).graph_module
        return PassResult(graph_module, modified)
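
For reference, here is a hedged eager-mode mirror of the full recipe in the docstring above; the helper name cumsum_via_conv is introduced for illustration and is not part of the commit. It reshapes to [N, 1, H, W], convolves with a ones kernel using symmetric padding along H, slices off the extra trailing rows, and reshapes back.

    import torch
    import torch.nn.functional as F
    from math import prod

    def cumsum_via_conv(x: torch.Tensor, dim: int) -> torch.Tensor:
        shape = list(x.shape)
        dim = dim % len(shape)
        n = prod(shape[:dim]) if dim > 0 else 1
        h = shape[dim]
        w = prod(shape[dim + 1 :]) if dim < len(shape) - 1 else 1
        view = x.reshape(n, 1, h, w)
        weight = torch.ones(1, 1, h, 1, dtype=x.dtype)
        # Symmetric padding of h - 1 along H yields h - 1 extra trailing rows...
        out = F.conv2d(view, weight, padding=(h - 1, 0))
        # ...which the pass removes with a slice before restoring the shape.
        return out[:, :, :h, :].reshape(shape)

    x = torch.rand(2, 3, 4)
    for d in range(x.dim()):
        assert torch.allclose(cumsum_via_conv(x, d), torch.cumsum(x, d))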

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 1 addition & 0 deletions

@@ -169,6 +169,7 @@ def is_node_supported(
             exir_ops.edge.aten.cat.default,
             exir_ops.edge.aten.ceil.default,
             exir_ops.edge.aten.clamp.default,
+            exir_ops.edge.aten.cumsum.default,
             exir_ops.edge.aten.bmm.default,
             exir_ops.edge.aten.permute_copy.default,
             exir_ops.edge.aten.hardsigmoid.default,

backends/arm/quantizer/quantization_annotator.py

Lines changed: 1 addition & 0 deletions

@@ -290,6 +290,7 @@ def _match_pattern(
     torch.ops.aten.asinh.default,
     torch.ops.aten.cosh.default,
     torch.ops.aten.acos.default,
+    torch.ops.aten.cumsum.default,
 ]

 _one_to_one_shared_input_qspec = [

backends/arm/test/ops/test_cumsum.py (new file)

Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Tuple

import torch

from executorch.backends.arm.test import common
from executorch.backends.arm.test.tester.test_pipeline import (
    EthosU55PipelineINT,
    EthosU85PipelineINT,
    TosaPipelineFP,
    TosaPipelineINT,
    VgfPipeline,
)

input_t1 = Tuple[torch.Tensor, int]
aten_op = "torch.ops.aten.cumsum.default"

"""
Tests the aten.cumsum operator by decomposing it into a convolution and
verifying results across various dims and pipelines.
"""


class CumsumModule(torch.nn.Module):
    test_parameters = {
        "1d_dim0": lambda: (torch.rand(10), 0),
        "1d_dim_neg1": lambda: (torch.rand(10), -1),
        "2d_dim1": lambda: (torch.rand(5, 6), 1),
        "3d_dim2": lambda: (torch.rand(2, 3, 4), 2),
        "3d_dim0": lambda: (torch.rand(2, 3, 4), 0),
        "4d_dim3": lambda: (torch.rand(1, 2, 3, 4), 3),
        "4d_dim1": lambda: (torch.rand(1, 2, 3, 4), 1),
    }

    def forward(self, x: torch.Tensor, dim: int) -> torch.Tensor:
        return torch.cumsum(x, dim)


@common.parametrize("test_data", CumsumModule.test_parameters)
def test_cumsum_tosa_FP(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = TosaPipelineFP[input_t1](
        module,
        args,
        aten_op,
        exir_op=[],
    )
    pipeline.run()


@common.parametrize("test_data", CumsumModule.test_parameters)
def test_cumsum_tosa_INT(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = TosaPipelineINT[input_t1](
        module,
        args,
        aten_op,
        exir_op=[],
    )
    pipeline.run()


@common.parametrize("test_data", CumsumModule.test_parameters)
@common.SkipIfNoModelConverter
def test_cumsum_vgf_FP(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = VgfPipeline[input_t1](
        module,
        args,
        aten_op,
        tosa_version="TOSA-1.0+FP",
    )
    pipeline.run()


@common.parametrize("test_data", CumsumModule.test_parameters)
@common.SkipIfNoModelConverter
def test_cumsum_vgf_INT(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = VgfPipeline[input_t1](
        module,
        args,
        aten_op,
        tosa_version="TOSA-1.0+INT",
    )
    pipeline.run()


@common.parametrize("test_data", CumsumModule.test_parameters)
@common.XfailIfNoCorstone300
def test_cumsum_u55_INT(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = EthosU55PipelineINT[input_t1](
        module,
        args,
        aten_ops=aten_op,
        exir_ops=[],
    )
    pipeline.run()


@common.parametrize("test_data", CumsumModule.test_parameters)
@common.XfailIfNoCorstone320
def test_cumsum_u85_INT(test_data: input_t1):
    module = CumsumModule()
    args = test_data()
    pipeline = EthosU85PipelineINT[input_t1](
        module,
        args,
        aten_ops=aten_op,
        exir_ops=[],
    )
    pipeline.run()
