pytorch
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/__init__.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/annotate_channels_last_dim_order_pass.py‎
Lines changed: 27 additions & 3 deletions b/‎backends/arm/_passes/annotate_channels_last_dim_order_pass.py‎
Lines changed: 27 additions & 3 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 0 additions & 3 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎backends/arm/_passes/decompose_cumsum_pass.py‎
Lines changed: 0 additions & 142 deletions b/‎backends/arm/_passes/decompose_cumsum_pass.py‎
Lines changed: 0 additions & 142 deletions
diff --git a/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 50 additions & 5 deletions b/‎backends/arm/_passes/insert_rescales_pass.py‎
Lines changed: 50 additions & 5 deletions
@@ -33,7 +33,6 @@
 from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass  # noqa
 from .decompose_cosh_pass import DecomposeCoshPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
-from .decompose_cumsum_pass import DecomposeCumsumPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa
 
@@ -14,12 +14,36 @@
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch.library import impl, Library
+
+# Define lib with passthrough operators. The operators have no real meaning in edge IR
+# except for argument validation and a passthrough output. The operators will be used
+# when lowering to TOSA, e.g. a passthrough_to_tosa._transpose will not affect
+# the edge IR graph but will be lowered to a TOSA-TRANSPOSE.
+lib = Library("passthrough_to_tosa", "DEF")
+# For certain operators we need the data in a specific data format. Changing tosa_dim_order
+# is not sufficient as we also need to transpose the data.
+# By utilizing an edge IR passthrough operator we can keep the edge program in
+# channels-first/contiguous and get the desired behavior in the TOSA lowering.
+lib.define("_transpose(Tensor self, int[] dim_order) -> Tensor")
+
+
+@impl(lib, "_transpose")
+def _transpose_impl(*args, **kwargs):
+    """Edge-IR implementation of passthrough_to_tosa._transpose.
+
+    Expects the input tensor as the first positional argument and the
+    dim_order list as the second. Raises ValueError unless dim_order has
+    exactly 4 or 5 entries; otherwise returns the input tensor unchanged.
+    """
+    dim = args[1]
+    # Only dim orders of length 4 or 5 are accepted.
+    if len(dim) not in (4, 5):
+        raise ValueError(
+            f"Dim order length must be either 4 or 5, got {len(dim)}: {dim}"
+        )
+    # No data movement in edge IR: hand the input straight back.
+    return args[0]
 
 
 class AnnotateChannelsLastDimOrder(ExportPass):
     """
     Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
-    that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts backend.tosa.TRANSPOSE
+    that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes. The pass also inserts passthrough_to_tosa._transpose
     when a transition between 3D and 4D/5D tensors happen.
     The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape.
     """
@@ -95,7 +119,7 @@ def insert_input_transpose(node, input_node, graph_module):
         with graph_module.graph.inserting_before(node):
             permute_node = create_node(
                 graph_module.graph,
-                exir_ops.backend.tosa.TRANSPOSE.default,
+                torch.ops.passthrough_to_tosa._transpose.default,
                 args=(
                     input_node,
                     list(
@@ -117,7 +141,7 @@ def insert_output_transpose(node, graph_module):
         with graph_module.graph.inserting_after(node):
             permute_node = create_node(
                 graph_module.graph,
-                exir_ops.backend.tosa.TRANSPOSE.default,
+                torch.ops.passthrough_to_tosa._transpose.default,
                 args=(
                     node,
                     list(
 
@@ -38,7 +38,6 @@
     DecomposeBatchNormNoStatsPass,
     DecomposeCoshPass,
     DecomposeCosineSimilarityPass,
-    DecomposeCumsumPass,
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeExpm1Pass,
@@ -149,7 +148,6 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
-        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
@@ -229,7 +227,6 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
-        self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
 
@@ -107,7 +107,7 @@ def call(self, graph_module):
         for node in graph_module.graph.nodes:
             if node.op != "call_function":
                 continue
-            if node.target == exir_ops.backend.tosa.TABLE.default:
+            if node.target == torch.ops.tosa._table.default:
                 continue
 
             input_nodes = node.all_input_nodes
 
@@ -3,25 +3,70 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 from copy import copy
 from typing import cast
 
+import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.backends.arm._passes.quant_args import QuantArgs
 from executorch.backends.arm.constants import DQ_OPS, Q_OPS
-from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from torch import Tensor
 from torch.fx import GraphModule, Node
+from torch.library import custom_op, register_fake
+
+logger = logging.getLogger(__name__)
+
+
+@custom_op("tosa::_rescale", mutates_args=())  # type: ignore[misc]
+def rescale(
+    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
+) -> Tensor:
+    """Default (eager) implementation of tosa::_rescale.
+
+    Only casts `x` to `dtype`; `scale`, `in_zp` and `out_zp` are accepted but
+    not applied, so the result is not a numerically correct rescale. A
+    warning is logged on every call.
+    """
+    logger.warning(
+        # Trailing space keeps the two concatenated sentences separated.
+        "Ran default implementation of tosa::_rescale. "
+        "This op is meant to always be inserted inside a partition and a correct default implementation is not implemented."
+    )
+    # Clone is needed to not return reference when rescaling to same dtype.
+    # This is a necessary requirement for non-mutating custom ops.
+    return x.to(dtype=dtype).clone()
+
+
+@register_fake("tosa::_rescale")  # type: ignore[misc]
+def rescale_fake(
+    x: Tensor, dtype: torch.dtype, scale: float, in_zp: int, out_zp: int
+) -> Tensor:
+    """Casts the input tensor to dtype `dtype` to produce the correct tensor meta for a _rescale op.
+    Additionally validates TOSA constraints of a RESCALE op.
+
+    Raises:
+        NotImplementedError: if `dtype` is not int8, int16 or int32.
+        ValueError: if a zero point is non-zero for an int16/int32 input or
+            output, or lies outside [-128, 127] for an int8 input or output.
+    """
+    if dtype not in (torch.int32, torch.int8, torch.int16):
+        raise NotImplementedError(
+            f"tosa::rescale currently only supports int32, int16 and int8, not {dtype}"
+        )
+    if dtype in (torch.int32, torch.int16) and out_zp != 0:
+        raise ValueError(
+            f"TOSA requires output_zp to be zero when the output dtype is {dtype}."
+        )
+    if x.dtype in (torch.int32, torch.int16) and in_zp != 0:
+        # Report the input dtype; `dtype` is the requested output dtype.
+        raise ValueError(
+            f"TOSA requires input_zp to be zero when the input dtype is {x.dtype}"
+        )
+    if x.dtype == torch.int8 and not -128 <= in_zp <= 127:
+        raise ValueError(f"{in_zp=} outside valid range (-128,127) for int8.")
+    if dtype == torch.int8 and not -128 <= out_zp <= 127:
+        raise ValueError(f"{out_zp=} outside valid range (-128,127) for int8.")
+
+    return x.to(dtype=dtype).clone()
 
 
 class InsertRescalePass(ExportPass):
     """Finds patterns of dq -> q, and replaces them
-    with backend dialect tosa::RESCALE op.
+    with tosa::_rescale ops.
 
-    Does not guarantee that the dtypes and zero points are valid
+    Does not guarantee that the dtypes and zero points are valid
     in TOSA, that is the job of the quantization annotator that
     produced the dq and q nodes. The TOSA constraints are validated
-    in the fake implementation of.
+    in the fake implementation of tosa::_rescale.
     """
 
     def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule):
@@ -32,7 +77,7 @@ def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule
         with graph_module.graph.inserting_before(node):
             rescale_node = create_node(
                 graph_module.graph,
-                exir_ops.backend.tosa.RESCALE.default,
+                torch.ops.tosa._rescale.default,
                 (
                     node.all_input_nodes[0],
                     q_args.dtype,