pytorch
diff --git a/‎.github/workflows/apple.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/apple.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 13 additions & 0 deletions b/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎backends/arm/TARGETS‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/TARGETS‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 12 additions & 1 deletion b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎backends/arm/_passes/decompose_floor_divide_pass.py‎
Lines changed: 75 additions & 0 deletions b/‎backends/arm/_passes/decompose_floor_divide_pass.py‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_int16_activation_conv2d_pass.py‎
Lines changed: 12 additions & 32 deletions b/‎backends/arm/_passes/decompose_int16_activation_conv2d_pass.py‎
Lines changed: 12 additions & 32 deletions
diff --git a/‎backends/arm/_passes/decompose_remainder_pass.py‎
Lines changed: 66 additions & 0 deletions b/‎backends/arm/_passes/decompose_remainder_pass.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_sum_pass.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/decompose_sum_pass.py‎
Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ jobs:
         id: set_version
         shell: bash
         run: |
-          VERSION="0.8.0.$(TZ='PST8PDT' date +%Y%m%d)"
+          VERSION="1.1.0.$(TZ='PST8PDT' date +%Y%m%d)"
           echo "version=$VERSION" >> "$GITHUB_OUTPUT"
 
   build-demo-ios:
 
@@ -15,6 +15,7 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
 from torch.export.exported_program import ExportedProgram
 
 
@@ -61,6 +62,18 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         tag_constant_data(exported_program)
         tag_mutated_buffer(exported_program)
 
+        # Tag constant placeholders that have no users
+        # tag_constant_data only tags constants that have users with delegation_tag
+        # but we need to tag all constants for this partition
+        for node in exported_program.graph.nodes:
+            if node.op == "placeholder" and (
+                is_param(exported_program, node)
+                or is_buffer(exported_program, node)
+                or is_lifted_tensor_constant(exported_program, node)
+            ):
+                if "delegation_tag" not in node.meta:
+                    node.meta["delegation_tag"] = tag
+
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
 
@@ -20,6 +20,7 @@ runtime.python_library(
     srcs = [
         "common/__init__.py",
         "common/debug.py",
+        "common/type.py",
     ],
     deps = [
         "fbsource//third-party/tosa_tools:serializer",
 
@@ -42,6 +42,7 @@
 from .decompose_elu_pass import DecomposeEluPass  # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa
+from .decompose_floor_divide_pass import DecomposeFloorDividePass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_glu_pass import DecomposeGluPass  # noqa
 from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
@@ -58,6 +59,7 @@
 from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
+from .decompose_remainder_pass import DecomposeRemainderPass  # noqa
 from .decompose_round_pass import DecomposeRoundPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
 from .decompose_sign_pass import DecomposeSignPass  # noqa
@@ -75,6 +77,7 @@
 )
 from .fuse_batchnorm2d_pass import FuseBatchnorm2DPass  # noqa
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
+from .fuse_duplicate_users_pass import FuseDuplicateUsersPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
 from .insert_int32_casts_after_int64_placeholders import (  # noqa
 
@@ -49,6 +49,7 @@
     DecomposeEluPass,
     DecomposeEmbeddingPass,
     DecomposeExpm1Pass,
+    DecomposeFloorDividePass,
     DecomposeGeluPass,
     DecomposeGluPass,
     DecomposeGroupedConv,
@@ -62,6 +63,7 @@
     DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
+    DecomposeRemainderPass,
     DecomposeRoundPass,
     DecomposeSelectPass,
     DecomposeSignPass,
@@ -76,6 +78,7 @@
     FoldAndAnnotateQParamsPass,
     FuseBatchnorm2DPass,
     FuseConstantArgsPass,
+    FuseDuplicateUsersPass,
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
     InsertInt32CastsAfterInt64PlaceholdersPass,
@@ -175,6 +178,7 @@ def _tosa_INT_pipeline(
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(ConvertELUParamsPass())
         self.add_pass(FoldAndAnnotateQParamsPass(exported_program))  # type: ignore[call-arg]
+        self.add_pass(FuseDuplicateUsersPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
         if self.tosa_spec.is_U55_subset:
@@ -209,6 +213,7 @@ def _tosa_INT_pipeline(
         self.add_pass(RewriteMatmulPass())
         self.add_pass(RewriteUpsamplePass())
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
+
         self.add_pass(InsertRescaleInt32Pass())
         self.add_pass(DecomposeSumPass())
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
@@ -222,6 +227,7 @@ def _tosa_FP_pipeline(
         self, exported_program: ExportedProgram, graph_module: GraphModule
     ) -> GraphModule:
         self.add_pass(AnnotateOutputDimOrderPass())
+        self.add_pass(FuseDuplicateUsersPass())
         self.add_pass(DecomposeExpm1Pass())
         self.add_pass(DecomposeLogitPass())
         self.add_pass(DecomposeMaskedFill())
@@ -240,8 +246,11 @@ def _tosa_FP_pipeline(
         self.add_pass(CastBoolToInt8Pass())
         self.add_pass(DecomposeSinhPass())
         self.add_pass(DecomposeSignPass())
+        self.add_pass(DecomposeFloorDividePass())
         self.add_pass(DecomposeDivTensorModePass())
         self.add_pass(ReplaceScalarWithTensorByProfilePass())
+        self.add_pass(DecomposeRemainderPass())
+        self.add_pass(DecomposeDivTensorModePass())
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
@@ -331,9 +340,11 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(CastBoolToInt8Pass())
         self.add_pass(DecomposeSignPass())
         self.add_pass(DecomposeAddmmPass())
+        self.add_pass(ReplaceScalarWithTensorByProfilePass())
+        self.add_pass(DecomposeRemainderPass())
+        self.add_pass(DecomposeFloorDividePass())
         self.add_pass(DecomposeDivTensorModePass())
         self.add_pass(DecomposeAddSubAlphaPass())
-        self.add_pass(ReplaceScalarWithTensorByProfilePass())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeGroupNormPass())
         self.add_pass(DecomposeLayerNormPass())
 
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.decompose_div_tensor_mode import (
+    DecomposeDivTensorModePass,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+edge_floor_divide_ops = (exir_ops.edge.aten.floor_divide.default,)
+aten_floor_divide_ops = (torch.ops.aten.floor_divide.default,)
+
+
+def get_floor_divide_decomposition(op) -> tuple:
+    """
+    Returns the operators needed to decompose the given aten.floor_divide
+    operation into its equivalent TOSA-supported operations.
+
+    This handles both edge dialect ops and core PyTorch ops. The decomposition strategy
+    is:
+        floor_div(x, y) → div_tensor_mode(x, y, rounding_mode="floor")
+    The full_like overload is returned as well so that the caller can turn an
+    integer divisor into a tensor before emitting the division.
+
+    Args:
+        op: The floor_divide operator overload that is being decomposed.
+
+    Returns:
+        A tuple (div_op, full_op) with the div.Tensor_mode and full_like.default
+        overloads taken from the same dialect (edge or aten) as the input op.
+
+    Raises:
+        RuntimeError: If the provided operator is not a supported floor_divide variant.
+    """
+
+    if op in edge_floor_divide_ops:
+        return (
+            exir_ops.edge.aten.div.Tensor_mode,
+            exir_ops.edge.aten.full_like.default,
+        )
+    if op in aten_floor_divide_ops:
+        return (
+            torch.ops.aten.div.Tensor_mode,
+            torch.ops.aten.full_like.default,
+        )
+
+    raise RuntimeError(f"Can't get floor_div decomposition for op {op}")
+
+
+class DecomposeFloorDividePass(ArmPass):
+    """
+    Rewrites aten.floor_divide as aten.div.Tensor_mode with rounding_mode="floor".
+
+    When the divisor is a Python integer it is first converted to a tensor by
+    calling full_like on the dividend.
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivTensorModePass}
+
+    def call_operator(self, op, args, kwargs, meta):
+        is_floor_divide = op in edge_floor_divide_ops or op in aten_floor_divide_ops
+        if not is_floor_divide:
+            # Every other operator is handed to the base class as-is.
+            return super().call_operator(op, args, kwargs, meta, updated=False)
+
+        div_op, full_op = get_floor_divide_decomposition(op)
+        dividend, divisor = args[0], args[1]
+
+        if isinstance(divisor, int):
+            # Replace the integer divisor with a tensor built by full_like.
+            divisor = super().call_operator(
+                full_op, (dividend, divisor), {}, meta, updated=False
+            )
+
+        return super().call_operator(
+            div_op,
+            (dividend, divisor),
+            {"rounding_mode": "floor"},
+            meta,
+            updated=True,
+        )
@@ -49,12 +49,8 @@ def call_operator(self, op, args, kwargs, meta):
             )
 
         # convolution with bias and activation is int16
-        # The bias is assumed to be quantized with the same quantization parameters as
-        # as the output of the convolution
         bias = args[2]
-        assert (
-            meta.data["output_qparams"][0].dtype == bias.data.dtype
-        ), "Bias needs to have same type as quantized output type"
+
         no_bias_args = list(args)
         no_bias_args[2] = None
         # split up to convolution + bias
@@ -79,46 +75,30 @@ def call_operator(self, op, args, kwargs, meta):
             # The conv will get the output int48 scaled to int32 in serialization step.
             # To be able to add the bias we need to first scale (cast?) the output to int32.
             # The resulting i32 sum will then need to be scaled back to the output dtype.
-
-            # calculate common rescale factor from convolution output and bias quantization
             output_qparams = cast(QuantArgs, meta.data["output_qparams"][0])
             conv_output_scale = output_qparams.scale
-            bias_qparams = cast(QuantArgs, meta.data["input_qparams"][2])
-            bias_scale = bias_qparams.scale
 
-            common_scale = max(bias_scale, conv_output_scale)
-
-            # calculate how we can rescale bias and conv to a common scale and maximize the output range
-            bias_rescale_factor = bias_scale / common_scale
-            conv_rescale_factor = conv_output_scale / common_scale
+            bias_qparams = cast(QuantArgs, meta.data["input_qparams"][2])
+            per_channel_quant = bias_qparams.per_channel
 
-            # Either of conv output or bias now covers the full int16 range and the other one a smaller range.
-            # Since we are upscaling to int32 we have 16 additional bits to work with to maximize the output range.
-            # Worst case here is that both bias and conv output covers the full int16 range so we leave one bit
-            # and then one for the sign bit.
-            bits_left_to_shift = 14
+            if per_channel_quant:
+                bias_scale = bias_qparams.get_scale_per_channel()
+            else:
+                bias_scale = [bias_qparams.get_scale_per_tensor()]
 
-            # update rescale factors
-            bias_rescale_factor *= 1 << bits_left_to_shift
-            conv_rescale_factor *= 1 << bits_left_to_shift
+            conv_rescale_factors = [1.0] * len(bias_scale)
+            final_output_scale = [b / conv_output_scale for b in bias_scale]
 
             conv_output = super().call_operator(
                 exir_ops.backend.tosa.RESCALE.default,
-                (convolution, torch.int32, [conv_rescale_factor], 0, 0),
-                {},
-                new_meta,
-            )
-
-            bias_rescaled = super().call_operator(
-                exir_ops.backend.tosa.RESCALE.default,
-                (channel_bias, torch.int32, [bias_rescale_factor], 0, 0),
+                (convolution, torch.int32, conv_rescale_factors, 0, 0),
                 {},
                 new_meta,
             )
 
             add = super().call_operator(
                 exir_ops.edge.aten.add.Tensor,
-                (conv_output, bias_rescaled),
+                (conv_output, channel_bias),
                 {},
                 new_meta,
             )
@@ -128,7 +108,7 @@ def call_operator(self, op, args, kwargs, meta):
                 (
                     add,
                     output_dtype,
-                    [(common_scale / (conv_output_scale * (1 << bits_left_to_shift)))],
+                    final_output_scale,
                     0,
                     0,
                 ),
 
@@ -0,0 +1,66 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.decompose_div_tensor_mode import (
+    DecomposeDivTensorModePass,
+)
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.pass_base import ExportPass
+from torch._ops import OpOverload
+
+Op = OpOverload | EdgeOpOverload
+
+
+def _get_remainder_decomposition_ops(op: Op) -> tuple[Op, Op, Op]:
+    """
+    Picks the (div_mode_op, mul_op, sub_op) triple used to lower a remainder
+    operator. The triple comes from the same op namespace (edge or aten) as
+    the remainder op that was passed in; any other operator raises a
+    RuntimeError.
+    """
+    if op == exir_ops.edge.aten.remainder.Tensor:
+        namespace = exir_ops.edge.aten
+    elif op == torch.ops.aten.remainder.Tensor:
+        namespace = torch.ops.aten
+    else:
+        raise RuntimeError(f"Can't get remainder decomposition ops for op {op}")
+
+    return (
+        namespace.div.Tensor_mode,
+        namespace.mul.Tensor,
+        namespace.sub.Tensor,
+    )
+
+
+class DecomposeRemainderPass(ArmPass):
+    """
+    Lowers the remainder operation to division, multiplication and subtraction:
+        remainder(x, y) -> x - floor_div(x, y) * y
+    with floor_div(x, y) expressed as div(x, y, rounding_mode="floor").
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivTensorModePass}
+
+    def call_operator(self, op, args, kwargs, meta, updated=False):
+        is_remainder = op in (
+            exir_ops.edge.aten.remainder.Tensor,
+            torch.ops.aten.remainder.Tensor,
+        )
+        if not is_remainder:
+            # Not a remainder op: defer to the base class with the caller's flag.
+            return super().call_operator(op, args, kwargs, meta, updated)
+
+        div_op, mul_op, sub_op = _get_remainder_decomposition_ops(op)
+        dividend, divisor = args[0], args[1]
+
+        # quotient = div(dividend, divisor, rounding_mode="floor")
+        quotient = super().call_operator(
+            div_op, (dividend, divisor), {"rounding_mode": "floor"}, meta, updated=True
+        )
+        # remainder = dividend - quotient * divisor
+        scaled = super().call_operator(
+            mul_op, (quotient, divisor), {}, meta, updated=True
+        )
+        return super().call_operator(sub_op, (dividend, scaled), {}, meta, updated=True)
@@ -68,8 +68,8 @@ def call_operator(self, op, args, kwargs, meta):
             case _:
                 raise ValueError(f"Invalid number of arguments ({len(args)}) provided.")
 
-        # If dims is None, sum over all dimensions
-        if dims is None:
+        # If dims evaluates to False (None or []), sum over all dimensions
+        if not dims:
             shape = input_node.data.size()
             dims = list(range(len(shape)))