Commit ee54d95

Merge branch 'main' into pt/py313
2 parents cb0ecb4 + aea2784 commit ee54d95

File tree

11 files changed: +438 −27 lines changed


backends/cadence/hifi/operators/op_quantized_linear_out.cpp

Lines changed: 33 additions & 2 deletions
@@ -9,6 +9,7 @@
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/backends/cadence/hifi/operators/operators.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/op_quantized_linear.h>
 #include <xa_nnlib_kernels_api.h>
 #include <xtensa/tie/xt_datacache.h>
 #include <algorithm>
@@ -218,7 +219,22 @@ void quantized_linear_out(
     int64_t out_zero_point,
     __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_asym8u(
         in,
         weight,
@@ -260,7 +276,22 @@ void quantized_linear_per_tensor_out(
     int64_t out_zero_point,
     __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_per_tensor_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_per_tensor_asym8u(
         in,
         weight,
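Both dispatch branches implement the same affine-quantization math: subtract the zero points, accumulate the products in a wide integer type, add the bias, then requantize through a fixed-point multiplier and shift. A minimal NumPy sketch of that reference computation, as an illustration only (the function name and the exact rounding behavior of the shift are assumptions, not the HiFi kernel source):

import numpy as np

def quantized_linear_ref(x, w, bias, x_zp, w_zp, out_mult, out_shift, out_zp):
    # Zero-point-corrected matmul, accumulated in int64 to avoid overflow.
    acc = (x.astype(np.int64) - x_zp) @ (w.astype(np.int64) - w_zp).T + bias
    # Requantize: fixed-point multiply by a Q31 multiplier, then shift.
    scaled = (acc * out_mult) >> 31
    scaled = scaled >> -out_shift if out_shift < 0 else scaled << out_shift
    # Re-center on the output zero point and saturate to the int16 range.
    return np.clip(scaled + out_zp, -32768, 32767).astype(np.int16)

With the values from the test below (out_mult = 2011373824, out_shift = -8), the effective output scale is roughly (2011373824 / 2^31) * 2^-8 ≈ 0.00366; exact results may differ from the kernel by rounding.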

backends/cadence/hifi/operators/targets.bzl

Lines changed: 4 additions & 1 deletion
@@ -87,7 +87,6 @@ OPERATORS = [
     "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out",
     "quantized_layer_norm",
-    "quantized_linear_out",
     "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
     "quantized_matmul_out",
@@ -122,3 +121,7 @@ def define_common_targets():
     # Define build targets for all operators registered in the tables above.
     for op in OPERATORS:
         define_operator(op)
+
+    # quantized_linear_out and quantized_linear_per_tensor_out need an additional dependency for int16 support.
+    define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])
+    define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+#include <sys/times.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <executorch/backends/cadence/hifi/operators/operators.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+using std::optional;
+using std::string_view;
+
+class HiFiQuantizedLinearTest : public OperatorTest {
+ public:
+ protected:
+  void quantized_linear_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      const Tensor& weight_zero_point,
+      const Tensor& out_multiplier,
+      const Tensor& out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+
+  void quantized_linear_per_tensor_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      int64_t weight_zero_point,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_per_tensor_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+};
+
+// Test quantized_linear_out with int16 activations and int8 weights.
+TEST_F(HiFiQuantizedLinearTest, QuantizedLinearInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Simple 2D case: input [2, 3] x weight [4, 3] = output [2, 4]
+  // Values captured from e2e test with
+  // CadenceWith16BitLinearActivationsQuantizer
+  Tensor input =
+      tf_int16.make({2, 3}, {-28170, -26389, -32768, -31474, -32266, -29076});
+  Tensor weight = tf_int8.make(
+      {4, 3}, {1, 87, -128, -114, -59, 44, -1, 127, -12, 44, -46, -29});
+  Tensor bias = tf_int32.zeros({4});
+  Tensor output = tf_int16.zeros({2, 4});
+
+  int64_t in_zero_point = -29822;
+  Tensor weight_zero_point = tf_int32.make({1}, {2});
+  Tensor out_multiplier = tf_int32.make({1}, {2011373824});
+  Tensor out_shift = tf_int32.make({1}, {-8});
+  int64_t out_zero_point = -30847;
+  quantized_linear_out(
+      input,
+      weight,
+      bias,
+      in_zero_point,
+      weight_zero_point,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      std::nullopt,
+      output);
+  // Expected output from e2e test
+  Tensor expected_output = tf_int16.make(
+      {2, 4}, {-28384, -32767, -29144, -30862, -31956, -29486, -31985, -30756});
+  EXPECT_TENSOR_CLOSE(output, expected_output);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl

backends/nxp/backend/edge_helper.py

Lines changed: 11 additions & 0 deletions
@@ -125,3 +125,14 @@ def previous_non_qdq_node(node: Node, input_index: int = 0) -> Node | None:
             current_node = current_node.args[0]
         else:
             return current_node
+
+
+Scale = list[float] | float
+ZeroPoint = list[int] | int
+
+
+def get_quantization_parameters_for(node: Node) -> tuple[Scale, ZeroPoint] | None:
+    if "quantize" not in node.target.__name__ or len(node.args) < 3:
+        return None
+
+    return node.args[1], node.args[2]  # Scale and zero_point.
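A hypothetical usage sketch for the new helper (the wrapper function and its printing are illustrative, not part of this commit): walk an exported FX graph and report the scale and zero point of every quantize/dequantize call the helper recognizes.

import torch

from executorch.backends.nxp.backend.edge_helper import get_quantization_parameters_for


def log_quant_params(graph_module: torch.fx.GraphModule) -> None:
    # Only call_function nodes have a callable target with a __name__;
    # placeholders and outputs carry string targets, so skip them.
    for node in graph_module.graph.nodes:
        if node.op != "call_function":
            continue
        params = get_quantization_parameters_for(node)
        if params is not None:
            scale, zero_point = params
            print(f"{node.name}: scale={scale}, zero_point={zero_point}")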
backends/nxp/edge_passes/remove_additional_quantize_dequantize_nodes_pass.py

Lines changed: 111 additions & 0 deletions

@@ -0,0 +1,111 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from executorch.backends.nxp.backend.edge_helper import get_quantization_parameters_for
+from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass
+from executorch.backends.nxp.neutron_partitioner import QDQClusterRecognizer
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx.passes.infra.pass_base import PassResult
+
+
+class RemoveAdditionalQDQClustersPass(NeutronEdgePass):
+    """
+    After delegation of partitions, there may be leftover dequantize/quantize nodes around QDQ clusters that
+    were not delegated. If those nodes are quantized per tensor and the quantization parameters of the
+    dequantize and quantize nodes in a QDQ cluster are equal, the nodes can be removed, and the inner nodes
+    are then computed directly in int8.
+
+    ┌────────────▼──────────┐
+    │ dequantize_per_tensor │
+    └────────────┬──────────┘
+                 │                              │
+             ┌───▼──┐    replace with       ┌───▼──┐
+             │ node │ ──────────────►       │ node │
+             └───┬──┘                       └───┬──┘
+                 │                              ▼
+     ┌───────────▼─────────┐
+     │ quantize_per_tensor │
+     └───────────┬─────────┘
+
+    """
+
+    qdq_per_channel_nodes = (
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    )
+
+    qdq_per_tensor_nodes = (
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    )
+
+    def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        nodes = list(graph_module.graph.nodes)
+        qdq_clusterer = QDQClusterRecognizer()
+        qdq_clusterer.tag_qdq_clusters(nodes)
+
+        for cluster in qdq_clusterer.cluster_map.values():
+            # For now, enable only permute_copy and cat.
+            if cluster.compute_node.target not in [
+                exir_ops.edge.aten.permute_copy.default,
+                exir_ops.edge.aten.cat.default,
+            ]:
+                continue
+
+            # Ensure the cluster doesn't contain per-channel dequantize/quantize nodes.
+            if any(
+                node
+                for node in cluster.ops
+                if node.target in self.qdq_per_channel_nodes
+            ):
+                continue
+
+            qdq_nodes = [
+                node for node in cluster.ops if node.target in self.qdq_per_tensor_nodes
+            ]
+
+            qdq_nodes_quant_params = [
+                get_quantization_parameters_for(node) for node in qdq_nodes
+            ]
+
+            equal_quant_scales = [
+                np.allclose(
+                    qdq_nodes_quant_params[idx][0], qdq_nodes_quant_params[idx + 1][0]
+                )
+                for idx in range(len(qdq_nodes_quant_params[:-1]))
+            ]
+
+            equal_quant_zero_points = [
+                np.allclose(
+                    qdq_nodes_quant_params[idx][1], qdq_nodes_quant_params[idx + 1][1]
+                )
+                for idx in range(len(qdq_nodes_quant_params[:-1]))
+            ]
+
+            # Check that all quantization params are equal, to ensure the QDQ cluster can be removed.
+            if not all(equal_quant_scales + equal_quant_zero_points):
+                continue
+
+            # Replace the uses of each dequantize/quantize node with its arg node.
+            for qdq_node in qdq_nodes:
+                qdq_node.replace_all_uses_with(qdq_node.args[0])
+                graph_module.graph.erase_node(qdq_node)
+
+            # Remove compute node cluster info from node meta.
+            cluster.compute_node.meta.pop("cluster")
+
+            graph_module = self.recompile_module(graph_module)
+
+            # The graph has now changed, and we cannot keep iterating through it. Return the new graph
+            # and the parent class will call this pass again.
+            return PassResult(graph_module, True)
+
+        return PassResult(graph_module, False)
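The removal condition above reduces to: every per-tensor quantize/dequantize node in the cluster must carry the same scale and the same zero point. A standalone sketch of that check (an illustrative helper mirroring the np.allclose chains in the pass, not code from this commit):

import numpy as np

def qdq_params_match(params: list[tuple[float, int]]) -> bool:
    # Compare each (scale, zero_point) pair against the next one;
    # the cluster is removable only if every comparison holds.
    return all(
        np.allclose(a[0], b[0]) and np.allclose(a[1], b[1])
        for a, b in zip(params, params[1:])
    )

# e.g. qdq_params_match([(0.02, 3), (0.02, 3)]) -> True
#      qdq_params_match([(0.02, 3), (0.05, 3)]) -> False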

backends/nxp/tests/executorch_pipeline.py

Lines changed: 7 additions & 2 deletions
@@ -17,6 +17,9 @@
 from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
     NeutronEdgePassManager,
 )
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
 from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import (
     RemoveIOQuantOpsPass,
 )
@@ -35,7 +38,6 @@
 from torch.export import export
 from torchao.quantization.pt2e.quantizer import Quantizer
 
-
 neutron_converter_flavor = "SDK_25_09"
 neutron_target_spec = NeutronTargetSpec(
     target="imxrt700", neutron_converter_flavor=neutron_converter_flavor
@@ -64,7 +66,6 @@ def _get_default_quantizer(target_spec: NeutronTargetSpec) -> Quantizer:
 def to_model_input_spec(
     input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]]
 ) -> tuple[ModelInputSpec, ...]:
-
     if isinstance(input_spec, tuple) and all(
         isinstance(spec, ModelInputSpec) for spec in input_spec
     ):
@@ -139,6 +140,10 @@ def to_quantized_edge_program(
         [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)]
     )
 
+    edge_program_manager = edge_program_manager.transform(
+        NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])
+    )
+
     return edge_program_manager
 
 
backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py

Lines changed: 5 additions & 5 deletions
@@ -104,7 +104,7 @@ def forward(self, x):
         return torch.permute(x, self.perm)
 
 
-class TestPermuteCopyConversion(kgb.SpyAgency, unittest.TestCase):
+class TestPermuteCopyConversion(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         torch.manual_seed(23)
@@ -302,9 +302,9 @@ def test_permute_copy_non_delegated_conversion__from_permute_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
 
     @parameterized.expand(
@@ -320,7 +320,7 @@ def test_permute_copy_non_delegated_conversion__from_transpose_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
