pytorch
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 8 additions & 6 deletions
diff --git a/backends/apple/coreml/runtime/delegate/multiarray.mm b/backends/apple/coreml/runtime/delegate/multiarray.mm
Lines changed: 4 additions & 1 deletion
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
Lines changed: 3 additions & 27 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 3 additions & 0 deletions
diff --git a/backends/arm/_passes/decompose_cumsum_pass.py b/backends/arm/_passes/decompose_cumsum_pass.py
Lines changed: 142 additions & 0 deletions
diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
Lines changed: 5 additions & 50 deletions
@@ -309,9 +309,15 @@ set(_common_include_directories
 )
 
 #
-# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
+# The `_<target>_srcs` lists are defined by executorch_load_build_variables.
 #
-
+if(EXECUTORCH_SRCS_FILE)
+  message(
+    WARNING
+      "EXECUTORCH_SRCS_FILE is no longer necessary and will not affect the build."
+  )
+endif()
+executorch_load_build_variables()
 if(NOT EXECUTORCH_SRCS_FILE)
   # A file wasn't provided. Run a script to extract the source lists from the
   # buck2 build system and write them to a file we can include.
@@ -324,10 +330,6 @@ if(NOT EXECUTORCH_SRCS_FILE)
   executorch_validate_build_variables()
 endif()
 
-# This file defines the `_<target>__srcs` variables used below.
-message(STATUS "executorch: Using sources file ${EXECUTORCH_SRCS_FILE}")
-include(${EXECUTORCH_SRCS_FILE})
-
 # Detect if an iOS toolchain is set.
 if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
   set(CMAKE_TOOLCHAIN_IOS ON)
 
@@ -124,7 +124,10 @@ bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArr
 
 bool copy_using_bnns(const MultiArray& src, MultiArray& dst) {
     if (src.layout().dataType() != dst.layout().dataType()) {
-        return false;
+        // Copying from FP16 to FP32 is supported and this is a common use case
+        if (!(src.layout().dataType() == MultiArray::DataType::Float16 && dst.layout().dataType() == MultiArray::DataType::Float32)) {
+            return false;
+        }
     }
     if (dst.layout().num_bytes() < src.layout().num_bytes()) {
         return false;
 
@@ -33,6 +33,7 @@
 from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass  # noqa
 from .decompose_cosh_pass import DecomposeCoshPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
+from .decompose_cumsum_pass import DecomposeCumsumPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa
 
@@ -14,36 +14,12 @@
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch.library import impl, Library
-
-# Define lib with passthrough operators. The operators have no real meaning in edge IR
-# except for argument validaiton and a passthrough output. The operators will be used
-# when lowering to TOSA, e.g. a passthrough_to_tosa._transpose will not affect
-# the edge IR graph but will be lowered to a TOSA-TRANSPOSE.
-lib = Library("passthrough_to_tosa", "DEF")
-# For certain operators we need the data in a specific data format. Changing tosa_dim_order
-# is not sufficient as we also need transpose the data.
-# By utilizing an edge IR passthrough operator we can keep the edge program in
-# channels-first/contiguous and get the desired behavior in the TOSA lowering.
-lib.define("_transpose(Tensor self, int[] dim_order) -> Tensor")
-
-
-@impl(lib, "_transpose")
-def _transpose_impl(*args, **kwargs):
-    # Validate length of dim_order array
-    dim = args[1]
-    if len(dim) != 4 and len(dim) != 5:
-        raise ValueError(
-            f"Dim order length must be either 4 or 5, got {len(dim)}: {dim}"
-        )
-    # Pass-through in edge-IR
-    return args[0]
 
 
 class AnnotateChannelsLastDimOrder(ExportPass):
     """
     Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
-    that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts passthrough_to_tosa._transpose
+    that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts backend.tosa.TRANSPOSE
     when a transition between 3D and 4D/5D tensors happen.
     The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape.
     """
@@ -119,7 +95,7 @@ def insert_input_transpose(node, input_node, graph_module):
         with graph_module.graph.inserting_before(node):
             permute_node = create_node(
                 graph_module.graph,
-                torch.ops.passthrough_to_tosa._transpose.default,
+                exir_ops.backend.tosa.TRANSPOSE.default,
                 args=(
                     input_node,
                     list(
@@ -141,7 +117,7 @@ def insert_output_transpose(node, graph_module):
         with graph_module.graph.inserting_after(node):
             permute_node = create_node(
                 graph_module.graph,
-                torch.ops.passthrough_to_tosa._transpose.default,
+                exir_ops.backend.tosa.TRANSPOSE.default,
                 args=(
                     node,
                     list(
 
@@ -38,6 +38,7 @@
     DecomposeBatchNormNoStatsPass,
     DecomposeCoshPass,
     DecomposeCosineSimilarityPass,
+    DecomposeCumsumPass,
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeExpm1Pass,
@@ -148,6 +149,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
+        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
@@ -227,6 +229,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
+        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
 
@@ -0,0 +1,142 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from math import prod
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm._passes.quant_args import QuantArgs
+
+from executorch.backends.transforms.utils import create_constant_placeholder
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import PassResult
+from torch.export.graph_signature import InputKind
+
+
+class DecomposeCumsumPass(ArmPass):
+    """
+    Decomposes cumsum into a 1D convolution with a kernel of ones.
+
+    For example, the cumsum of an input tensor [1, 1] is [1, 1 + 1] = [1, 2].
+    To decompose this, the input tensor is pre-padded with len(input)-1 zeros and a
+    kernel of ones [1, 1], of length len(input), is slid over it:
+
+    Input:  [0, 1, 1]
+    Kernel: [1, 1]       = [1]
+               [1, 1]    = [2]
+
+    Since PyTorch only supports symmetric padding, in reality the result will have
+    an additional 1 calculated at the end, which requires an extra slice op.
+
+    To extend this to higher dimensions, the input is reshaped to [N, C, H, W] with
+       N = <dims before cumsum dim>
+       C = 1
+       H = <cumsum dim>
+       W = <dims after cumsum dim>
+    And the convolution is applied over dimension H.
+    """
+
+    def call(self, graph_module):
+        graph = graph_module.graph
+        targets = (exir_ops.edge.aten.cumsum.default, torch.ops.aten.cumsum.default)
+        modified = False
+        for node in list(graph.nodes):
+            if node.op != "call_function" or node.target not in targets:
+                continue
+
+            if len(node.args) != 2:
+                raise ValueError(
+                    "Cumsum node should have exactly two arguments: input and dim."
+                )
+
+            # Get node data
+            input_node, dim = node.args
+            val = node.meta.get("val")
+            original_shape = list(val.shape)
+            dtype = input_node.meta.get("val").dtype
+            dim = dim % len(original_shape)
+
+            # Compute shapes
+            pre_cumsum_dim = prod(original_shape[:dim]) if dim > 0 else 1
+            cumsum_dim = original_shape[dim]
+            post_cumsum_dim = (
+                prod(original_shape[dim + 1 :]) if dim < len(original_shape) - 1 else 1
+            )
+            conv_shape = [
+                pre_cumsum_dim,
+                1,
+                cumsum_dim,
+                post_cumsum_dim,
+            ]
+            pad_shape = [original_shape[dim] - 1, 0]
+            weight_shape = [1, 1, original_shape[dim], 1]
+
+            # Create convolution weight
+            with graph.inserting_before(list(graph.nodes)[0]):
+                weight_data = torch.ones(size=weight_shape, dtype=dtype)
+                weight_node = create_constant_placeholder(
+                    self.exported_program,
+                    graph,
+                    node.name + "_kernel",
+                    InputKind.PARAMETER,
+                    weight_data,
+                )
+
+            # Create decomposed nodes
+            view_op = exir_ops.edge.aten.view_copy.default
+            conv_op = exir_ops.edge.aten.convolution.default
+            slice_op = exir_ops.edge.aten.slice_copy.Tensor
+            with graph.inserting_before(node):
+                # Reshape the input to the 4D conv_shape [N, C=1, H, W]
+                view_args = (input_node, conv_shape)
+                view_node = create_node(graph, view_op, args=view_args, from_node=node)
+
+                conv_args = (
+                    view_node,
+                    weight_node,
+                    None,
+                    [1, 1],
+                    pad_shape,
+                    [1, 1],
+                    False,
+                    [0],
+                    1,
+                )
+                conv_node = create_node(graph, conv_op, args=conv_args, from_node=node)
+
+                # The convolution is inserted after quantization, so we need to set our
+                # own quantization parameters for the weights here. However since the
+                # data is ones directly created as int8, they already have correct scale
+                # and so no scaling needs to be done, i.e. set scale=1.0, zero_point=0.0
+                if (
+                    "input_qparams" in conv_node.meta
+                    and len(conv_node.meta["input_qparams"]) > 0
+                ):
+                    qparams = QuantArgs(1.0, 0.0, -128, 127, torch.int8)
+                    conv_node.meta["input_qparams"][1] = qparams
+
+                slice_args = (conv_node, 2, 0, original_shape[dim])
+                slice_node = create_node(
+                    graph, slice_op, args=slice_args, from_node=node
+                )
+
+                view_original_args = (slice_node, original_shape)
+                view_original_node = create_node(
+                    graph, view_op, args=view_original_args, from_node=node
+                )
+
+            # Replace and remove original
+            node.replace_all_uses_with(view_original_node)
+            graph.erase_node(node)
+            modified = True
+
+        if modified:
+            # Cleanup
+            graph.eliminate_dead_code()
+            graph_module.recompile()
+            # Apply any operator-level transforms
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified)
@@ -107,7 +107,7 @@ def call(self, graph_module):
         for node in graph_module.graph.nodes:
             if node.op != "call_function":
                 continue
-            if node.target == torch.ops.tosa._table.default:
+            if node.target == exir_ops.backend.tosa.TABLE.default:
                 continue
 
             input_nodes = node.all_input_nodes
 
@@ -3,70 +3,25 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import logging
 from copy import copy
 from typing import cast
 
-import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch import Tensor
 from torch.fx import GraphModule, Node
-from torch.library import custom_op, register_fake
-
-logger = logging.getLogger(__name__)
-
-
-@custom_op("tosa::_rescale", mutates_args=())  # type: ignore[misc]
-def rescale(
-    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
-) -> Tensor:
-    logger.warning(
-        "Ran default implementation of tosa::_rescale."
-        "This op is meant to always be inserted inside a partition and a correct default implementation is not implemented."
-    )
-    # Clone is needed to not return reference when rescaling to same dtype.
-    # This is a neccessary requirement for non-mutating custom ops.
-    return x.to(dtype=dtype).clone()
-
-
-@register_fake("tosa::_rescale")  # type: ignore[misc]
-def rescale_fake(
-    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
-) -> Tensor:
-    """Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op.
-    Additionally validates TOSA constraints of a RESCALE op.
-    """
-    if dtype not in (torch.int32, torch.int8, torch.int16):
-        raise NotImplementedError(
-            f"tosa::rescale currently only supports int32, int16 and int8, not {dtype}"
-        )
-    if dtype in (torch.int32, torch.int16) and out_zp != 0:
-        raise ValueError(
-            f"TOSA requires output_zp to be zero when the output dtype is {dtype}."
-        )
-    if x.dtype in (torch.int32, torch.int16) and in_zp != 0:
-        raise ValueError(
-            f"TOSA requires input_zp to be zero when the input dtype is {dtype}"
-        )
-    if x.dtype == torch.int8 and not -128 <= in_zp <= 127:
-        raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.")
-    if dtype == torch.int8 and not -128 <= out_zp <= 127:
-        raise ValueError(f"{out_zp=} outside valid range (-128,127) for int8.")
-
-    return x.to(dtype=dtype).clone()
 
 
 class InsertRescalePass(ExportPass):
     """Finds patterns of dq -> q, and replaces them
-    with passthrough_to_tosa::rescales.
+    with the backend dialect tosa::RESCALE op.
 
-    Does not garantuee that the dtypes and zero points are valid
+    Does not guarantee that the dtypes and zero points are valid
     in TOSA, that is the job of the quantization annotator that
     produced the dq and q nodes. The TOSA constraints are validated
-    in the fake implementation of passthrough_to_tosa:rescale.
+    in the fake implementation of that op.
     """
 
     def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule):
@@ -77,7 +32,7 @@ def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule
         with graph_module.graph.inserting_before(node):
             rescale_node = create_node(
                 graph_module.graph,
-                torch.ops.tosa._rescale.default,
+                exir_ops.backend.tosa.RESCALE.default,
                 (
                     node.all_input_nodes[0],
                     q_args.dtype,