NXP backend: Add RemoveAdditionalQDQClustersPass.

MartinPavella · MartinPavella · commit aa651f138b7a · 2025-10-15T13:50:58.000+02:00
diff --git a/backends/nxp/backend/edge_helper.py b/backends/nxp/backend/edge_helper.py
@@ -87,3 +87,23 @@ def try_get_tensor_constant_from_node(
             return None
         attr_itr = getattr(attr_itr, atom)
     return attr_itr
+
+
+Scale = list[float] | float
+ZeroPoint = list[int] | int
+
+
+def get_quantization_parameters_for(node: Node) -> tuple[Scale, ZeroPoint] | None:
+    """Return the scale and zero point stored in the arguments of a quantize/dequantize node.
+
+    :param node: Node to inspect.
+    :return: Tuple `(scale, zero_point)` taken from `node.args[1]` and `node.args[2]`, or `None` if the name of
+              the node's target does not contain "quantize" or the node has fewer than 3 arguments.
+    """
+    # Only a callable target has a `__name__`. Nodes with a string target (e.g. placeholders or the output) cannot
+    #  be quantization operators, so they must result in `None` instead of an `AttributeError`.
+    target_name = getattr(node.target, "__name__", "")
+    if "quantize" not in target_name or len(node.args) < 3:
+        return None
+
+    return node.args[1], node.args[2]  # Scale and zero_point
diff --git a/backends/nxp/edge_passes/remove_additional_quantize_dequantize_nodes_pass.py b/backends/nxp/edge_passes/remove_additional_quantize_dequantize_nodes_pass.py
@@ -0,0 +1,126 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from executorch.backends.nxp.backend.edge_helper import get_quantization_parameters_for
+from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass
+from executorch.backends.nxp.neutron_partitioner import QDQClusterRecognizer
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx.passes.infra.pass_base import PassResult
+
+
+class RemoveAdditionalQDQClustersPass(NeutronEdgePass):
+    """
+    After delegation of partitions, there may be additional dequantize and quantize nodes left in QDQ clusters that
+    were not delegated. If these nodes are quantized per tensor and the quantization parameters of all dequantize
+    and quantize nodes in a QDQ cluster are equal, the nodes can be removed and thus the inner node computed in int8.
+
+                                         │
+                            ┌────────────▼──────────┐
+                            │ dequantize_per_tensor │
+                            └────────────┬──────────┘
+                                         │                                    │
+                                     ┌───▼──┐        replace with         ┌───▼──┐
+                                     │ node │       ──────────────►       │ node │
+                                     └───┬──┘                             └───┬──┘
+                                         │                                    ▼
+                             ┌───────────▼─────────┐
+                             │ quantize_per_tensor │
+                             └───────────┬─────────┘
+                                         ▼
+
+    """
+
+    qdq_per_channel_nodes = (
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    )
+
+    qdq_per_tensor_nodes = (
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    )
+
+    def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """Remove the dequantize/quantize nodes of the first suitable QDQ cluster found in the graph.
+
+        :param graph_module: Graph module to process.
+        :return: `PassResult` with the `modified` flag set if a QDQ cluster was removed. In that case the pass stops
+                  right away, as the graph has changed.
+        """
+        nodes = list(graph_module.graph.nodes)
+        qdq_clusterer = QDQClusterRecognizer()
+        qdq_clusterer.tag_qdq_clusters(nodes)
+
+        for cluster in qdq_clusterer.cluster_map.values():
+            # For now, enable only permute_copy and cat.
+            if cluster.compute_node.target not in [
+                exir_ops.edge.aten.permute_copy.default,
+                exir_ops.edge.aten.cat.default,
+            ]:
+                continue
+
+            # Ensure cluster doesn't contain dequantize/quantize per channel nodes.
+            if any(node.target in self.qdq_per_channel_nodes for node in cluster.ops):
+                continue
+
+            qdq_nodes = [
+                node for node in cluster.ops if node.target in self.qdq_per_tensor_nodes
+            ]
+            if not qdq_nodes:
+                # Nothing to remove. Reporting a modification would only make the parent class run the pass again.
+                continue
+
+            qdq_nodes_quant_params = [
+                get_quantization_parameters_for(node) for node in qdq_nodes
+            ]
+
+            # Check if all quantization params are equal to ensure that QDQ cluster can be removed.
+            if not self._are_quant_params_equal(qdq_nodes_quant_params):
+                continue
+
+            # Replace the uses of each dequantize/quantize node with its arg node.
+            for qdq_node in qdq_nodes:
+                qdq_node.replace_all_uses_with(qdq_node.args[0])
+                graph_module.graph.erase_node(qdq_node)
+
+            # Remove compute node cluster info from node meta.
+            cluster.compute_node.meta.pop("cluster", None)
+
+            graph_module = self.recompile_module(graph_module)
+
+            # The graph has now changed, and we cannot keep iterating through it. Return the new graph and the parent
+            #  class will call this pass again.
+            return PassResult(graph_module, True)
+
+        return PassResult(graph_module, False)
+
+    @staticmethod
+    def _are_quant_params_equal(quant_params: list[tuple | None]) -> bool:
+        """Check that the quantization parameters are known statically and equal for all nodes.
+
+        :param quant_params: Items returned by `get_quantization_parameters_for()` for the nodes of 1 QDQ cluster.
+        :return: `True` if every item holds a scale and a zero point which are not graph nodes, and all adjacent
+                  items have matching scales and zero points.
+        """
+        for params in quant_params:
+            # The parameters cannot be compared if they are missing, or if they are produced by other nodes of the
+            #  graph (presumably the case for the `.tensor` overloads).
+            if params is None or any(isinstance(p, torch.fx.Node) for p in params):
+                return False
+
+        for (scale, zero_point), (next_scale, next_zero_point) in zip(
+            quant_params, quant_params[1:]
+        ):
+            if not np.allclose(scale, next_scale):
+                return False
+            if not np.allclose(zero_point, next_zero_point):
+                return False
+
+        return True
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
@@ -15,6 +15,9 @@
 from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
     NeutronEdgePassManager,
 )
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
 from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
 from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
 from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
@@ -58,7 +61,6 @@ def get_random_calibration_inputs(
 def to_model_input_spec(
     input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]]
 ) -> tuple[ModelInputSpec, ...]:
-
     if isinstance(input_spec, tuple) and all(
         isinstance(spec, ModelInputSpec) for spec in input_spec
     ):
@@ -126,6 +128,10 @@ def to_quantized_edge_program(
     partitioner = NeutronPartitioner(compile_spec, custom_delegation_options)
     edge_program_manager = edge_program_manager.to_backend(partitioner)
 
+    edge_program_manager = NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])(
+        edge_program_manager
+    )
+
     return edge_program_manager
 
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
@@ -104,7 +104,7 @@ def forward(self, x):
         return torch.permute(x, self.perm)
 
 
-class TestPermuteCopyConversion(kgb.SpyAgency, unittest.TestCase):
+class TestPermuteCopyConversion(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         torch.manual_seed(23)
@@ -302,9 +302,9 @@ def test_permute_copy_non_delegated_conversion__from_permute_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
 
     @parameterized.expand(
@@ -320,7 +320,7 @@ def test_permute_copy_non_delegated_conversion__from_transpose_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
diff --git a/backends/nxp/tests/test_edge_passes.py b/backends/nxp/tests/test_edge_passes.py
@@ -1,14 +1,49 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import unittest
+
 import numpy as np
+import torch
+
+from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import (
+    NeutronAtenPassManager,
+)
+from executorch.backends.nxp.backend.custom_delegation_options import (
+    CustomDelegationOptions,
+)
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import (
     ViewCopyConverter,
 )
-from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
+    NeutronEdgePassManager,
+)
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
+from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
+from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
+from executorch.backends.nxp.tests.executorch_pipeline import (
+    _quantize_model,
+    get_random_calibration_inputs,
+    to_model_input_spec,
+    to_quantized_edge_program,
+)
 from executorch.backends.nxp.tests.executors import (
+    compare_output_arrays,
     EdgeProgramExecutor,
     OverrideTargetSupportCheck,
 )
+from executorch.backends.nxp.tests.ir.converter.node_converter.test_permute_copy_converter import (
+    Conv2dPermuteModule,
+)
 from executorch.backends.nxp.tests.models import ConvFCFCSoftmaxModuleWithoutReshape
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.extension.export_util.utils import export_to_edge
 from torch.fx import Graph, Node
 
 
@@ -57,18 +92,26 @@ def _assert_nodes_form_a_view_copy_qdq_cluster(graph: Graph, node_indices: list[
     assert quantize.args[0] == view_copy
 
 
-def test_moving_view_copy_into_separate_qdq_clusters():
-    model = ConvFCFCSoftmaxModuleWithoutReshape()
-    input_shape = (1, 4, 3, 33)
+class TestEdgePasses(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(23)
+        np.random.seed(42)
 
-    # Prohibit `view_copy` conversion for the testing purposes.
-    def unsupported_target(*_):
-        return False
+    def test_moving_view_copy_into_separate_qdq_clusters(self):
+        model = ConvFCFCSoftmaxModuleWithoutReshape()
+        input_shape = (1, 4, 3, 33)
+
+        # Prohibit `view_copy` conversion for the testing purposes.
+        def unsupported_target(*_):
+            return False
+
+        # Convert the model while the `view_copy` target support check is overridden.
+        with OverrideTargetSupportCheck(
+            ViewCopyConverter, new_target_support_check=unsupported_target
+        ):
+            epm = to_quantized_edge_program(model, input_shape, target="imxrt700")
 
-    with OverrideTargetSupportCheck(
-        ViewCopyConverter, new_target_support_check=unsupported_target
-    ):
-        epm = to_quantized_edge_program(model, input_shape, target="imxrt700")
         exported_program = epm.exported_program()
 
         nodes = list(exported_program.graph_module.graph.nodes)
@@ -86,3 +129,96 @@ def unsupported_target(*_):
         input_data = np.random.random(input_shape).astype("float32")
         program_executor = EdgeProgramExecutor(exported_program)
         program_executor.inference(input_data)
+
+    def test_remove_additional_quantize_dequantize_nodes_pass(self):
+        """Check that `RemoveAdditionalQDQClustersPass` removes the QDQ nodes around `permute_copy`.
+
+        The graph is inspected before and after running the pass, and the outputs of both edge programs are
+        compared for the same random input.
+        """
+        input_shape = (1, 3, 8, 16)
+        new_dims = (3, 2, 1, 0)
+        model = Conv2dPermuteModule(input_shape[1], new_dims)
+        target = "imxrt700"
+        custom_delegation_options = CustomDelegationOptions()
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+
+        example_input = calibration_inputs[0]
+        exir_program_aten = torch.export.export(model, example_input).module()
+
+        # Run pre-processing passes of the float32 aten dialect program.
+        exir_program_aten = NeutronAtenPassManager()(exir_program_aten).graph_module
+
+        exir_program_aten_quant = _quantize_model(
+            exir_program_aten, NeutronQuantizer(), calibration_inputs
+        )
+        edge_program_manager = export_to_edge(
+            exir_program_aten_quant,
+            example_input,
+        )
+
+        edge_program_manager = NeutronEdgePassManager()(edge_program_manager)
+
+        compile_spec = generate_neutron_compile_spec(target, "SDK_25_09")
+        partitioner = NeutronPartitioner(compile_spec, custom_delegation_options)
+
+        edge_program_manager = edge_program_manager.to_backend(partitioner)
+
+        # Make sure QDQ cluster for permute_copy is present.
+        # A deep copy is used, presumably so this program is not affected when the pass is run later.
+        edge_program_with_qdq_cluster = copy.deepcopy(
+            edge_program_manager.exported_program()
+        )
+        nodes = list(edge_program_with_qdq_cluster.graph.nodes)
+        assert len(nodes) == 10
+        assert (
+            nodes[5].target
+            == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+        )
+        assert nodes[6].target == exir_ops.edge.aten.permute_copy.default
+        assert "cluster" in nodes[6].meta
+        assert (
+            nodes[7].target
+            == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+        )
+
+        # Run pass for removal of additional QDQ nodes and compute in non-float types where possible
+        edge_program_manager = NeutronEdgePassManager(
+            [RemoveAdditionalQDQClustersPass()]
+        )(edge_program_manager)
+
+        # Make sure QDQ cluster for permute_copy is removed.
+        edge_program_without_qdq_cluster = edge_program_manager.exported_program()
+        nodes = list(edge_program_without_qdq_cluster.graph.nodes)
+        assert len(nodes) == 8
+        assert nodes[4].name == "getitem"
+        assert nodes[5].target == exir_ops.edge.aten.permute_copy.default
+        assert "cluster" not in nodes[5].meta
+        assert (
+            nodes[6].target
+            == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+        )
+
+        edge_program_executor_without_qdq_cluster = EdgeProgramExecutor(
+            edge_program_without_qdq_cluster
+        )
+        edge_program_executor_with_qdq_cluster = EdgeProgramExecutor(
+            edge_program_with_qdq_cluster
+        )
+
+        input_data = np.random.random(input_shape).astype(np.float32)
+        edge_program_output_without_qdq_cluster = (
+            edge_program_executor_without_qdq_cluster.inference(input_data)
+        )
+        edge_program_output_with_qdq_cluster = (
+            edge_program_executor_with_qdq_cluster.inference(input_data)
+        )
+
+        compare_output_arrays(
+            edge_program_output_without_qdq_cluster,
+            edge_program_output_with_qdq_cluster,
+            "main output",
+        )
diff --git a/backends/nxp/tests/test_turning_batch_first_gru_to_time_major.py b/backends/nxp/tests/test_turning_batch_first_gru_to_time_major.py
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py
@@ -11,13 +11,15 @@
 from collections import defaultdict
 from typing import Iterator
 
-import executorch.extension.pybindings.portable_lib
 import executorch.kernels.quantized  # noqa F401
 
 import torch
 from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
     NeutronEdgePassManager,
 )
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
 from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
 from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
 from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
@@ -272,6 +274,10 @@ def _get_batch_size(data):
         remove_io_quant_ops=args.remove_quant_io_ops
     )(edge_program_manager)
 
+    edge_program_manager = NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])(
+        edge_program_manager
+    )
+
     logging.debug(f"Lowered graph:\n{edge_program_manager.exported_program().graph}")
 
     # 5. Export to ExecuTorch program