Commit 618ade4

Merge branch 'main' into export-D82995994
2 parents: 8e2b8ed + 16ced4e


57 files changed: +1089 −136 lines

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 0 deletions
@@ -159,6 +159,7 @@ cmake_install_executorch_libraries() {
         -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
         -DEXECUTORCH_BUILD_QNN="$QNN" \
+        -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DQNN_SDK_ROOT="$QNN_SDK_ROOT"
     cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }

backends/arm/_passes/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -46,6 +46,9 @@
 from .decompose_glu_pass import DecomposeGluPass  # noqa
 from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
+from .decompose_int16_activation_conv2d_pass import (  # noqa
+    DecomposeConv2dWithInt16ActivationPass,
+)
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa

backends/arm/_passes/add_bias_pass.py

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,7 @@
 import torch
 from executorch.backends.arm._passes import ArmPass
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.transforms.utils import create_constant_placeholder
 
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -59,6 +60,10 @@ def call(self, graph_module):
                     persistent_buffer=True,
                     name=f"{node.name}_bias",
                 )
+                if node.args[0].meta["val"].dtype == torch.int16:
+                    bias_node.meta[TosaSpecialDtype.meta_key()] = (
+                        TosaSpecialDtype.INT48
+                    )
                 node.update_arg(2, bias_node)
 
         if modified:
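
The tag added above marks the newly created bias placeholder so that later serialization can widen it to int48 when the activation is int16 (TOSA requires an int48 bias in that case). A minimal, self-contained sketch of the same tagging pattern, using hypothetical stand-in names rather than the real TosaSpecialDtype and FX node APIs:

from enum import Enum

class SpecialDtype(Enum):
    # Hypothetical stand-in for executorch.backends.arm.tosa.mapping.TosaSpecialDtype
    INT48 = "int48"

    @staticmethod
    def meta_key() -> str:
        return "tosa_special_dtype"

def tag_bias_meta(activation_dtype: str, bias_meta: dict) -> dict:
    # int16 activations use an int48 bias in TOSA, so the bias constant is
    # tagged for special handling instead of the default int32.
    if activation_dtype == "int16":
        bias_meta[SpecialDtype.meta_key()] = SpecialDtype.INT48
    return bias_meta

print(tag_bias_meta("int16", {}))  # {'tosa_special_dtype': <SpecialDtype.INT48: 'int48'>}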

backends/arm/_passes/arm_pass_manager.py

Lines changed: 8 additions & 1 deletion
@@ -42,6 +42,7 @@
     DecomposeAtanPass,
     DecomposeAvgPool2d,
     DecomposeBatchNormNoStatsPass,
+    DecomposeConv2dWithInt16ActivationPass,
     DecomposeCoshPass,
     DecomposeCosineSimilarityPass,
     DecomposeCumsumPass,
@@ -183,6 +184,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(DecomposeGroupedConv())
+
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
@@ -196,9 +198,14 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
 
         self.add_pass(FuseViewCopyTransform())
         self.add_pass(FuseConstantArgsPass(exported_program))
+        self.add_pass(InsertTableOpsPass(exported_program))
+        # If we have a conv2d with int16 activation split up into a convolution
+        # and an addition, to work-around the lack of support for int48 in torch
+        # needs to happen before AddBiasPass, but after the table ops are inserted
+        # to be able to validate that conv2d has right dtype arguments.
+        self.add_pass(DecomposeConv2dWithInt16ActivationPass())
         self.add_pass(AddBiasPass(exported_program))
 
-        self.add_pass(InsertTableOpsPass(exported_program))
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
backends/arm/_passes/decompose_int16_activation_conv2d_pass.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from typing import cast
+
+import torch
+from executorch.backends.arm._passes.quant_args import QuantArgs
+
+from executorch.backends.arm.tosa.specification import get_context_spec, Tosa_1_00
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+
+class DecomposeConv2dWithInt16ActivationPass(ExportPass):
+    """
+    This pass decomposes a convolution with input dtype int16 and bias
+    into a convolution without bias followed by an addition of the bias
+    since the TOSA op requires the bias to be int48 which is hard to represent
+    in torch. Instead rescale the int48 output to int16 and add the bias in int16.
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op != exir_ops.edge.aten.convolution.default:
+            return super().call_operator(op, args, kwargs, meta)
+
+        tosa_spec = get_context_spec()
+        if not tosa_spec.support_integer():
+            return super().call_operator(op, args, kwargs, meta)
+
+        # return if no bias
+        if args[2] is None:
+            return super().call_operator(op, args, kwargs, meta)
+
+        if args[0].data.dtype == torch.int8:
+            return super().call_operator(op, args, kwargs, meta)
+        elif args[0].data.dtype == torch.int16:
+            if isinstance(tosa_spec, Tosa_1_00) and not tosa_spec.support_extension(
+                "int16"
+            ):
+                raise ValueError(
+                    "int16 activation for convolution requires TOSA int16 extension"
+                )
+        else:
+            raise NotImplementedError(
+                "Decomposition to conv+add only implemented for activation of int16 type"
+            )
+
+        # convolution with bias and activation is int16
+        # The bias is assumed to be quantized with the same quantization parameters
+        # as the output of the convolution
+        bias = args[2]
+        assert (
+            meta.data["output_qparams"][0].dtype == bias.data.dtype
+        ), "Bias needs to have same type as quantized output type"
+        no_bias_args = list(args)
+        no_bias_args[2] = None
+        # split up to convolution + bias
+        convolution = super().call_operator(op, tuple(no_bias_args), kwargs, meta)
+
+        # create a copy of the meta without the qparams, to be used with the new nodes
+        new_meta = meta.copy()
+        new_meta.data.pop("output_qparams", None)
+        new_meta.data.pop("input_qparams", None)
+
+        # reshape the tensor to the same rank as the convolution output to add the bias to the channels
+        channel_bias = super().call_operator(
+            exir_ops.edge.aten.view_copy.default,
+            (bias, [1, len(bias.data), 1, 1]),
+            {},
+            new_meta,
+        )
+
+        output_dtype = meta.data["output_qparams"][0].dtype
+
+        if output_dtype == torch.int16:
+            # The conv will get the output int48 scaled to int32 in serialization step.
+            # To be able to add the bias we need to first scale (cast?) the output to int32.
+            # The resulting i32 sum will then need to be scaled back to the output dtype.
+
+            # calculate common rescale factor from convolution output and bias quantization
+            output_qparams = cast(QuantArgs, meta.data["output_qparams"][0])
+            conv_output_scale = output_qparams.scale
+            bias_qparams = cast(QuantArgs, meta.data["input_qparams"][2])
+            bias_scale = bias_qparams.scale
+
+            common_scale = max(bias_scale, conv_output_scale)
+
+            # calculate how we can rescale bias and conv to a common scale and maximize the output range
+            bias_rescale_factor = bias_scale / common_scale
+            conv_rescale_factor = conv_output_scale / common_scale
+
+            # Either of conv output or bias now covers the full int16 range and the other one a smaller range.
+            # Since we are upscaling to int32 we have 16 additional bits to work with to maximize the output range.
+            # Worst case here is that both bias and conv output covers the full int16 range so we leave one bit
+            # and then one for the sign bit.
+            bits_left_to_shift = 14
+
+            # update rescale factors
+            bias_rescale_factor *= 1 << bits_left_to_shift
+            conv_rescale_factor *= 1 << bits_left_to_shift
+
+            conv_output = super().call_operator(
+                exir_ops.backend.tosa.RESCALE.default,
+                (convolution, torch.int32, conv_rescale_factor, 0, 0),
+                {},
+                new_meta,
+            )
+
+            bias_rescaled = super().call_operator(
+                exir_ops.backend.tosa.RESCALE.default,
+                (channel_bias, torch.int32, bias_rescale_factor, 0, 0),
+                {},
+                new_meta,
+            )
+
+            add = super().call_operator(
+                exir_ops.edge.aten.add.Tensor,
+                (conv_output, bias_rescaled),
+                {},
+                new_meta,
+            )
+
+            res_rescale = super().call_operator(
+                exir_ops.backend.tosa.RESCALE.default,
+                (
+                    add,
+                    output_dtype,
+                    (common_scale / (conv_output_scale * (1 << bits_left_to_shift))),
+                    0,
+                    0,
+                ),
+                {},
+                new_meta,
+            )
+
+        else:
+            raise NotImplementedError(
+                f"Decomposition to conv+add only implemented for activation of int16 type, not for {output_dtype}"
+            )
+
+        return res_rescale
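
A small worked example (with made-up quantization scales) of the rescale arithmetic above: the conv output and the bias are first brought to a shared scale with 14 bits of int32 headroom, added, and then rescaled back so the sum is expressed at the convolution's output scale:

# Worked example of the rescale factors computed by the pass above.
# The scales below are made up for illustration.
conv_output_scale = 0.02           # quantization scale of the conv output (int16)
bias_scale = 0.005                 # quantization scale of the bias (int16)
bits_left_to_shift = 14            # headroom used when moving to int32

common_scale = max(bias_scale, conv_output_scale)                                   # 0.02
bias_rescale_factor = bias_scale / common_scale * (1 << bits_left_to_shift)         # 4096.0
conv_rescale_factor = conv_output_scale / common_scale * (1 << bits_left_to_shift)  # 16384.0
back_factor = common_scale / (conv_output_scale * (1 << bits_left_to_shift))        # 1 / 16384

q_conv, q_bias = 1200, 80          # example quantized conv output and bias values
acc_i32 = q_conv * conv_rescale_factor + q_bias * bias_rescale_factor
q_out = acc_i32 * back_factor      # 1220.0, expressed in units of the output scale

# Dequantized result equals conv output plus bias: 1220 * 0.02 == 1200 * 0.02 + 80 * 0.005
assert abs(q_out * conv_output_scale - (q_conv * conv_output_scale + q_bias * bias_scale)) < 1e-9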

backends/arm/_passes/fuse_equal_placeholders_pass.py

Lines changed: 7 additions & 0 deletions
@@ -8,11 +8,13 @@
 from typing import Set, Type
 
 import torch
+
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_constant_placeholder_kind,
     get_param_tensor,
     is_param_node,
 )
+from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
     delete_constant_placeholder,
@@ -47,9 +49,14 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 continue
             # Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes
             # Ensure tensor is on CPU and contiguous
+
+            # ensure we don't merge any special case int48_t tensors with int32_t tensors
+            # since int48_t tensors needs to be instantiated separately.
+            is_int48 = node.meta.get(TosaSpecialDtype.meta_key(), None)
             t_cpu = tensor.detach().cpu().contiguous()
             data_bytes = t_cpu.numpy().tobytes()
             key = (
+                is_int48,
                 str(t_cpu.dtype),
                 tuple(t_cpu.shape),
                 hashlib.sha1(data_bytes).hexdigest(),
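
A minimal sketch (hypothetical tensors, real torch and hashlib calls) of why the int48 tag is included in the fingerprint: two byte-identical constants only fuse when their tag matches as well, so an int48-tagged bias is never merged with an ordinary int32 constant:

import hashlib

import torch

def fingerprint(t: torch.Tensor, special_dtype) -> tuple:
    # Mirrors the key built above: special-dtype tag + dtype + shape + SHA1 of raw bytes.
    t_cpu = t.detach().cpu().contiguous()
    return (
        special_dtype,
        str(t_cpu.dtype),
        tuple(t_cpu.shape),
        hashlib.sha1(t_cpu.numpy().tobytes()).hexdigest(),
    )

a = torch.tensor([1, 2, 3], dtype=torch.int32)
b = torch.tensor([1, 2, 3], dtype=torch.int32)

assert fingerprint(a, None) == fingerprint(b, None)     # identical constants can fuse
assert fingerprint(a, "INT48") != fingerprint(b, None)  # an int48-tagged bias stays separate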

backends/arm/operator_support/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -19,4 +19,5 @@
     slice_copy_support,
     to_dim_order_copy_support,
     tosa_supported_operators,
+    where_support,
 )

backends/arm/operator_support/ethos_u55_support.py

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ class EthosU55NotSupported(OperatorSupportBase):
         exir_ops.edge.aten.bitwise_and.Scalar,
         exir_ops.edge.aten.bitwise_or.Scalar,
         exir_ops.edge.aten.bitwise_xor.Scalar,
-        exir_ops.edge.aten.bitwise_not,
+        exir_ops.edge.aten.bitwise_not.default,
         exir_ops.edge.aten.logical_and.default,
         exir_ops.edge.aten.logical_or.default,
         exir_ops.edge.aten.logical_xor.default,

backends/arm/operator_support/index_tensor_support.py

Lines changed: 17 additions & 1 deletion
@@ -2,6 +2,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Provide TOSA support checks for ``aten.index.Tensor``.
+
+Reject unsupported patterns such as high-rank index tensors, front-positioned
+slice/ellipsis/None markers, and cases that exceed ``int32`` element limits.
+
+"""
 
 import math
 
@@ -18,7 +24,8 @@
 
 @register_tosa_support_check
 class IndexTensorSupported(SupportedTOSAOperatorCheck):
-    """
+    """Prevent partitioning of unsupported ``index.Tensor`` usages.
+
     This support check is intended to prevent the partitioning of
     currently unsupported usages of the index.Tensor operator.
 
@@ -95,6 +102,7 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck):
            t[1:3, torch.arange(5), 2:3, torch.arange(3).reshape(3,1)]
        are also possible and can result in some unintuitive behaviors
        where batching and indexing are mixed together.
+
    """
 
    targets = [exir_ops.edge.aten.index.Tensor]
@@ -107,6 +115,14 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck):
     def is_node_tosa_supported(
         self, node: fx.Node, tosa_spec: TosaSpecification
     ) -> bool:  # type: ignore[override, misc]
+        """Return True if ``aten.index.Tensor`` usage fits supported patterns.
+
+        Enforces the following constraints:
+        - No ``None`` (unsqueeze), slice, or ellipsis before an indexing tensor.
+        - Indexing tensors have rank <= 3.
+        - The value tensor element count fits in ``int32``.
+
+        """
         indices = node.args[1]
         for index in indices:  # type: ignore[union-attr]
             # Usage 2 guard
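
Illustrative (made-up) indexing patterns for the constraints listed in the new docstrings; each expression is valid PyTorch, and the comments state whether this support check would allow the node to be partitioned:

import torch

t = torch.rand(8, 8, 8)

t[torch.arange(5)]        # index tensor comes first, rank 1: candidate for partitioning
t[..., torch.arange(5)]   # ellipsis before the index tensor: rejected
t[None, torch.arange(5)]  # None (unsqueeze) before the index tensor: rejected
t[1:3, torch.arange(5)]   # slice before the index tensor: rejected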

backends/arm/operator_support/tosa_profile_supported_op_lists.py

Lines changed: 1 addition & 2 deletions
@@ -104,7 +104,6 @@
     exir_ops.edge.aten.squeeze_copy.dims,
     exir_ops.edge.aten.pow.Tensor_Scalar,
     exir_ops.edge.aten.pow.Tensor_Tensor,
-    exir_ops.edge.aten.where.self,
     operator.getitem,
     exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
     exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
@@ -136,6 +135,7 @@
     exir_ops.edge.aten.logit.default,
     exir_ops.edge.aten.acos.default,
     exir_ops.edge.aten.elu.default,
+    exir_ops.edge.aten.bitwise_not.default,
 }
 
 
@@ -220,7 +220,6 @@
     exir_ops.edge.aten.squeeze_copy.dims,
     exir_ops.edge.aten.pow.Tensor_Scalar,
     exir_ops.edge.aten.pow.Tensor_Tensor,
-    exir_ops.edge.aten.where.self,
     operator.getitem,
     exir_ops.edge.aten.constant_pad_nd.default,
     exir_ops.edge.aten.amax.default,
