Commit 2c82054

skywall authored and StrycekSimon committed
NXP backend: Add support for per-channel quantization for Conv
1 parent e7c43b6 commit 2c82054

8 files changed: +211, -16 lines

backends/nxp/backend/edge_program_converter.py

Lines changed: 3 additions & 1 deletion
@@ -134,6 +134,7 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContext
 
     qdq_related_functions = [
         exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
     ]
 
@@ -203,7 +204,8 @@ def _convert_qdq_cluster_q_dq_nodes(
        :param conversion_context: ConversionContext instance.
        """
        qdq_q_ops_converters = {
-           exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQDequantizeConverter,  # noqa F405
+           exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQPerTensorDequantizeConverter,  # noqa F405
+           exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: QDQPerChannelDequantizeConverter,  # noqa F405
            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: QDQQuantizeConverter,  # noqa F405
        }
 
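Note: the split into per-tensor and per-channel converters follows from the differing argument layouts of the two ops, as defined in PyTorch's quantized_decomposed op library. A sketch of the positional schemas (keyword-only extras such as out_dtype omitted):

# dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype)
#     scale / zero_point are Python scalars -> node.args[1], node.args[2]
# dequantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype)
#     scales / zero_points are tensors (one entry per channel) -> node.args[1], node.args[2]
#
# dtype is the last positional argument in both schemas, which is why the
# shared support check in qdq_dequantize_converter.py (next file) reads
# node.args[-1] rather than the per-tensor-only index node.args[5].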

backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -41,7 +41,8 @@
     PermuteCopyConverter,
 )
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_dequantize_converter import (
-    QDQDequantizeConverter,
+    QDQPerChannelDequantizeConverter,
+    QDQPerTensorDequantizeConverter,
 )
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_quantize_converter import (
     QDQQuantizeConverter,
@@ -70,7 +71,8 @@
     "PermuteCopyConverter",
     "SoftmaxConverter",
     "ViewCopyConverter",
-    "QDQDequantizeConverter",
+    "QDQPerTensorDequantizeConverter",
+    "QDQPerChannelDequantizeConverter",
     "QDQQuantizeConverter",
     "ConstantPadNDConverter",
     "ReLUConverter",

backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py

Lines changed: 32 additions & 6 deletions
@@ -2,6 +2,7 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+from abc import ABC, abstractmethod
 
 import numpy as np
 
@@ -19,15 +20,23 @@
 from torch.nn import Parameter
 
 
-class QDQDequantizeConverter(NodeConverter):
+class QDQDequantizeConverterBase(NodeConverter, ABC):
+
+    @abstractmethod
+    def get_zero_point(self, node: Node) -> np.ndarray:
+        pass
+
+    @abstractmethod
+    def get_scale(self, node: Node) -> np.ndarray:
+        pass
 
     @staticmethod
     def _is_supported_in_IR(
         node: Node,
         parameters_mapping: dict[str, Parameter],
         custom_delegation_options: CustomDelegationOptions,
     ) -> bool:
-        zero_point_type = torch_type_to_numpy_type(node.args[5])
+        zero_point_type = torch_type_to_numpy_type(node.args[-1])
         if "cluster" not in node.meta or zero_point_type not in [np.int8, np.int32]:
             return False
 
@@ -39,10 +48,8 @@ def convert(self, node: Node):
         from_tensor = self.builder.tensor_for_name(node.name)
         to_tensor = self.builder.tensor_for_name(node.args[0].name)
 
-        zero_point_type = torch_type_to_numpy_type(node.args[5])
-
-        scale = np.array(node.args[1], dtype=np.float32)
-        zero_point = np.array(node.args[2], dtype=zero_point_type)
+        scale = self.get_scale(node)
+        zero_point = self.get_zero_point(node)
 
         if self.context.parameters_mapping.get(node.args[0].name, None) is None:
             # Convert dequantize as identity op (Transpose that will be removed) because
@@ -63,3 +70,22 @@ def convert(self, node: Node):
         # Change type so we pass check tensor similarity check when redirecting
         from_tensor.type = to_tensor.type
         self.builder.redirect_tensor(from_tensor, to_tensor)
+
+
+class QDQPerTensorDequantizeConverter(QDQDequantizeConverterBase):
+
+    def get_zero_point(self, node: Node) -> np.ndarray:
+        zero_point_type = torch_type_to_numpy_type(node.args[5])
+        return np.array(node.args[2], dtype=zero_point_type)
+
+    def get_scale(self, node: Node) -> np.ndarray:
+        return np.array(node.args[1], dtype=np.float32)
+
+
+class QDQPerChannelDequantizeConverter(QDQDequantizeConverterBase):
+
+    def get_zero_point(self, node: Node) -> np.ndarray:
+        return self.context.parameters_mapping[node.args[2].name].numpy()
+
+    def get_scale(self, node: Node) -> np.ndarray:
+        return self.context.parameters_mapping[node.args[1].name].numpy()
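Note: the two subclasses differ only in where the quantization parameters live; per-tensor nodes carry scalar scale/zero-point arguments, while per-channel nodes reference parameter tensors with one entry per channel. A rough numpy sketch of the dequantization semantics the converters map onto the IR (illustrative only, not the converter's actual code path):

import numpy as np

def dequantize_per_tensor(x_q, scale, zero_point):
    # One scalar (scale, zero_point) pair for the whole tensor.
    return (x_q.astype(np.float32) - zero_point) * scale

def dequantize_per_channel(x_q, scales, zero_points, axis):
    # One (scale, zero_point) pair per slice along `axis`; reshape the
    # parameter vectors so they broadcast across the remaining dims.
    shape = [1] * x_q.ndim
    shape[axis] = -1
    scales = scales.reshape(shape).astype(np.float32)
    zero_points = zero_points.reshape(shape).astype(np.float32)
    return (x_q.astype(np.float32) - zero_points) * scales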

backends/nxp/quantizer/patterns.py

Lines changed: 4 additions & 1 deletion
@@ -16,6 +16,7 @@
 from torchao.quantization.pt2e.quantizer import (
     DerivedQuantizationSpec,
     FixedQParamsQuantizationSpec,
+    QuantizationSpec,
     SharedQuantizationSpec,
 )
 from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
@@ -54,7 +55,9 @@ class PartitionAnchors:
         tuple[fx.Node, NodeArgsIdx]
         | tuple[fx.Node, NodeArgsIdx, SharedQuantizationSpec],
     ] = field(default_factory=list)
-    weights: list[tuple[fx.Node, NodeArgsIdx]] = field(default_factory=list)
+    weights: list[
+        tuple[fx.Node, NodeArgsIdx] | tuple[fx.Node, NodeArgsIdx, QuantizationSpec],
+    ] = field(default_factory=list)
     biases: list[
         tuple[fx.Node, NodeArgsIdx]
         | tuple[fx.Node, NodeArgsIdx, DerivedQuantizationSpec],
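Note: the widened `weights` field lets a pattern attach an explicit QuantizationSpec to a weight anchor via the new 3-tuple form. A hypothetical per-channel weight anchor, mirroring the test pattern added further down (`conv2d_node` is assumed to be the matched conv node):

per_channel_wgt_qspec = QuantizationSpec(
    dtype=torch.int8,
    observer_or_fake_quant_ctr=PerChannelMinMaxObserver,
    quant_min=-127,
    quant_max=127,
    qscheme=torch.per_channel_symmetric,
    ch_axis=0,  # one scale/zero-point per output channel
)
anchors = PartitionAnchors(
    inputs=[(conv2d_node, NodeArgsIdx(0))],
    weights=[(conv2d_node, NodeArgsIdx(1), per_channel_wgt_qspec)],  # 3-tuple form
)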

backends/nxp/tests/executorch_pipeline.py

Lines changed: 7 additions & 4 deletions
@@ -38,9 +38,9 @@ class ModelInputSpec:
     dtype: torch.dtype = torch.float32
 
 
-def _quantize_model(model, calibration_inputs: list[tuple[torch.Tensor, ...]]):
-    quantizer = NeutronQuantizer()
-
+def _quantize_model(
+    model, quantizer, calibration_inputs: list[tuple[torch.Tensor, ...]]
+):
     m = prepare_pt2e(model, quantizer)
     for data in calibration_inputs:
         m(*data)
@@ -91,6 +91,7 @@ def to_quantized_edge_program(
     neutron_converter_flavor="SDK_25_06",
     remove_quant_io_ops=False,
     custom_delegation_options=CustomDelegationOptions(),  # noqa B008
+    get_quantizer_fn=lambda: NeutronQuantizer(),
 ) -> EdgeProgramManager:
     calibration_inputs = get_calibration_inputs_fn(to_model_input_spec(input_spec))
 
@@ -102,7 +103,9 @@ def to_quantized_edge_program(
     exir_program_aten = torch.export.export(model, example_input, strict=True)
 
     exir_program_aten__module_quant = _quantize_model(
-        exir_program_aten.module(), calibration_inputs
+        exir_program_aten.module(),
+        get_quantizer_fn(),
+        calibration_inputs,
     )
 
     edge_compile_config = EdgeCompileConfig(_check_ir_validity=False)
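Note: with the quantizer now injected instead of hard-coded, callers can swap in a custom quantizer without touching the pipeline. A usage sketch (Conv2dPatternPerChannel and static_qconfig are defined in the new test file below; the default still builds a NeutronQuantizer):

# Default behaviour, unchanged:
edge_program = to_quantized_edge_program(model, input_shape)

# Supplying a custom quantizer, e.g. for per-channel Conv weights:
edge_program = to_quantized_edge_program(
    model,
    input_shape,
    get_quantizer_fn=lambda: NeutronAtenQuantizer(
        Conv2dPatternPerChannel(is_per_channel=True), static_qconfig
    ),
)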
Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import kgb
+import numpy as np
+import torch
+
+from executorch.backends.nxp.backend.edge_program_converter import (
+    EdgeProgramToIRConverter,
+)
+from executorch.backends.nxp.quantizer.neutron_quantizer import (
+    act_qspec,
+    NeutronAtenQuantizer,
+    wgt_qspec,
+)
+from executorch.backends.nxp.quantizer.patterns import (
+    NodeArgsIdx,
+    PartitionAnchors,
+    QuantizationPattern,
+)
+from executorch.backends.nxp.quantizer.utils import get_bias_qparams
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.executors import (
+    convert_run_compare,
+    ToChannelFirstPreprocess,
+    ToChannelLastPreprocess,
+)
+from executorch.backends.nxp.tests.models import Conv2dModule
+from executorch.backends.nxp.tests.test_quantizer import _get_target_name
+
+from torch import fx
+from torch._ops import OpOverload
+from torch.export import ExportedProgram
+from torchao.quantization.pt2e import MinMaxObserver, PerChannelMinMaxObserver
+from torchao.quantization.pt2e.quantizer import (
+    DerivedQuantizationSpec,
+    QuantizationConfig,
+    QuantizationSpec,
+)
+
+
+class Conv2dPatternPerChannel(QuantizationPattern):
+
+    def __init__(self, is_per_channel: bool):
+        super().__init__()
+        self.is_per_channel = is_per_channel
+
+    def partition_types(self) -> list[OpOverload]:
+        return [torch.ops.aten.conv2d.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule]
+    ) -> PartitionAnchors:
+        conv2d_node = fused_partition[0].nodes[-1]
+
+        bias_qscheme = (
+            torch.per_channel_symmetric
+            if self.is_per_channel
+            else torch.per_tensor_symmetric
+        )
+        bias_quantization_qspec = DerivedQuantizationSpec(
+            derived_from=[
+                (conv2d_node.args[0], conv2d_node),
+                (conv2d_node.args[1], conv2d_node),
+            ],
+            derive_qparams_fn=get_bias_qparams,
+            dtype=torch.int32,
+            quant_min=-(2**31) + 1,
+            quant_max=2**31 - 1,
+            qscheme=bias_qscheme,
+            ch_axis=0,
+        )
+
+        weight_qscheme = (
+            torch.per_channel_symmetric
+            if self.is_per_channel
+            else torch.per_tensor_symmetric
+        )
+        weight_observer_or_fake_quant_ctr = (
+            PerChannelMinMaxObserver if self.is_per_channel else MinMaxObserver
+        )
+        weight_quantization_spec = QuantizationSpec(
+            dtype=torch.int8,
+            observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr,
+            quant_min=-127,
+            quant_max=127,
+            qscheme=weight_qscheme,
+            ch_axis=0,
+        )
+
+        return PartitionAnchors(
+            inputs=[(conv2d_node, NodeArgsIdx(0))],
+            weights=[(conv2d_node, NodeArgsIdx(1), weight_quantization_spec)],
+            biases=[(conv2d_node, NodeArgsIdx(2), bias_quantization_qspec)],
+            output=[(conv2d_node,)],
+        )
+
+
+class TestPerChannelConversion(unittest.TestCase):
+    __test__ = False  # Prevent interfering with PyTest tests
+
+    def test_per_channel_convolution(self):
+        with kgb.spy_on(
+            EdgeProgramToIRConverter.convert_program, call_original=True
+        ) as converter_spy:
+            model = Conv2dModule(
+                in_channels=8, out_channels=32, kernel_size=5, padding=3
+            )
+            input_shape = (1, 8, 32, 32)
+
+            static_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_qspec, None)
+            _ = to_quantized_edge_program(
+                model,
+                input_shape,
+                get_quantizer_fn=lambda: NeutronAtenQuantizer(
+                    Conv2dPatternPerChannel(is_per_channel=True), static_qconfig
+                ),
+            )
+
+            tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value
+            exported_program: ExportedProgram = converter_spy.calls[-1].args[0]
+
+            input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(
+                np.int8
+            )
+
+            convert_run_compare(
+                exported_program,
+                tflite_input_preprocess=ToChannelLastPreprocess(),
+                tfl_model=tflite_flatbuffers_model,
+                tflite_output_preprocess=ToChannelFirstPreprocess(),
+                input_data=input_data,
+                atol=1.0,
+            )
+
+            nodes = list(exported_program.graph.nodes)
+
+            assert _get_target_name(nodes[8]).endswith(
+                "quantized_decomposed.dequantize_per_channel.default"
+            )
+            assert _get_target_name(nodes[9]).endswith(
+                "quantized_decomposed.dequantize_per_channel.default"
+            )
+            assert nodes[10].name == "aten_convolution_default"
+
+    @classmethod
+    def setUpClass(cls):
+        torch.manual_seed(25)
+        np.random.seed(25)
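Note: the test's weight spec uses a symmetric int8 range of [-127, 127], which pins every per-channel zero point to 0. A standalone sketch of what per-channel symmetric quantization of a conv weight computes (hypothetical values, not taken from the test):

import torch

# Conv2d weight laid out as (out_channels, in_channels, kH, kW); ch_axis=0
# gives each output channel its own scale.
w = torch.randn(32, 8, 5, 5)
scales = w.abs().amax(dim=(1, 2, 3)) / 127.0     # one scale per output channel
zero_points = torch.zeros(32, dtype=torch.int64)  # symmetric -> zero point 0
w_q = torch.quantize_per_channel(w, scales, zero_points, axis=0, dtype=torch.qint8)
print(w_q.q_per_channel_scales().shape)  # torch.Size([32])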

backends/nxp/tests/test_removing_dead_code.py

Lines changed: 3 additions & 1 deletion
@@ -9,6 +9,7 @@
 import pytest
 import torch
 
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
 from executorch.backends.nxp.tests.executorch_pipeline import _quantize_model
 from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops
 
@@ -45,8 +46,9 @@ def test_removing_dead_code(self):
         )
 
         # The `NeutronQuantizer` should remove the dead code in the `transform_for_annotation()` method.
+        quantizer = NeutronQuantizer()
         exir_program_aten_quant = _quantize_model(
-            exir_program_aten.module(), [example_inputs]
+            exir_program_aten.module(), quantizer, [example_inputs]
         )
 
         # Make sure the is no `add` operation in the graph anymore.

backends/nxp/tests/test_split_group_convolution.py

Lines changed: 5 additions & 1 deletion
@@ -17,6 +17,7 @@
 )
 from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
 from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
 from executorch.backends.nxp.tests.executorch_pipeline import (
     _quantize_model,
     get_random_calibration_inputs,
@@ -39,8 +40,11 @@ def _quantize_and_lower_module(
     module: GraphModule, input_shape: tuple[int, ...], target="imxrt700"
 ) -> EdgeProgramManager:
     calibration_inputs = get_random_calibration_inputs(to_model_input_spec(input_shape))
+    quantizer = NeutronQuantizer()
 
-    exir_program_aten__module_quant = _quantize_model(module, calibration_inputs)
+    exir_program_aten__module_quant = _quantize_model(
+        module, quantizer, calibration_inputs
+    )
 
     edge_compile_config = EdgeCompileConfig(_check_ir_validity=False)
     edge_program_manager = export_to_edge(
