
Commit c3567b4

Martin Lindström authored and committed
Arm backend: Make per-channel quantization default
Support for per-channel quantization was recently added to the Arm backend. This patch changes the default setting to per-channel quantization for the weights of convolutional and linear layers, instead of the previous per-tensor default. Per-channel quantization offers better numerical accuracy for models containing convolutional and/or fully connected layers, so unless the use case has an explicit limitation that prevents it, it is generally preferred. The quantization granularity can still be set manually using `get_symmetric_quantization_config(is_per_channel=False)`; this patch only changes the default.

Unit and model tests are affected by this change. Error tolerances for those tests have not been changed, since model outputs are compared against a reference that uses the exact same quantization strategy: if a model output is altered by this patch, the reference it is compared against is altered accordingly.

To verify the impact of this change on top-1 and top-5 accuracy, a manual test was run on MobileNetV2. The results show a noticeable improvement:

- Per-tensor quantization, Top-1 / Top-5 accuracy: 66.45% / 87.50%
- Per-channel quantization, Top-1 / Top-5 accuracy: 70.85% / 89.50%

Change-Id: I35d5c62741c7f93b916560874689245db96a588b
1 parent a2b620a commit c3567b4
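
For anyone who needs to keep the previous behaviour, the override is a one-liner on the quantizer config. A minimal sketch (the import path is assumed from this repo's layout, and `compile_spec` is assumed to come from the existing Ethos-U helpers):

# Sketch: keeping per-tensor weight quantization after this patch.
# Assumptions: the import path mirrors backends/arm/quantizer/arm_quantizer.py,
# and compile_spec is built elsewhere with the usual Ethos-U helpers.
from executorch.backends.arm.quantizer.arm_quantizer import (
    EthosUQuantizer,
    get_symmetric_quantization_config,
)

quantizer = EthosUQuantizer(compile_spec)
# is_per_channel now defaults to True; pass False explicitly to keep
# the old per-tensor granularity for conv/linear weights.
operator_config = get_symmetric_quantization_config(is_per_channel=False)
quantizer.set_global(operator_config)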

File tree

8 files changed (+46, -45 lines)

backends/arm/quantizer/arm_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 
 @functools.lru_cache
 def get_symmetric_quantization_config(
-    is_per_channel: bool = False,
+    is_per_channel: bool = True,
     is_qat: bool = False,
     is_dynamic: bool = False,
     act_qmin: int = -128,

backends/arm/test/misc/test_bn_relu_folding_qat.py

Lines changed: 3 additions & 1 deletion
@@ -59,7 +59,9 @@ def test_qat_tosa_BI(model: torch.nn.Module):
             "quantize",
             Quantize(
                 quantizer=quantizer,
-                quantization_config=get_symmetric_quantization_config(is_qat=True),
+                quantization_config=get_symmetric_quantization_config(
+                    is_qat=True, is_per_channel=False
+                ),
                 is_qat=True,
             ),
         )

backends/arm/test/models/test_mobilenet_v2_arm.py

Lines changed: 0 additions & 3 deletions
@@ -46,7 +46,6 @@ def test_mv2_tosa_BI():
         aten_op=[],
         exir_op=[],
         use_to_edge_transform_and_lower=True,
-        per_channel_quantization=True,
         atol=0.25,
         qtol=1,
     )
@@ -63,7 +62,6 @@
         exir_ops=[],
         run_on_fvp=True,
         use_to_edge_transform_and_lower=True,
-        per_channel_quantization=True,
         atol=0.25,
         qtol=1,
     )
@@ -80,7 +78,6 @@
         exir_ops=[],
         run_on_fvp=True,
         use_to_edge_transform_and_lower=True,
-        per_channel_quantization=True,
         atol=0.25,
         qtol=1,
     )

backends/arm/test/ops/test_conv2d.py

Lines changed: 7 additions & 3 deletions
@@ -384,6 +384,10 @@ def forward(self, x):
     "2x2_3x2x40x40_nobias": "MLETORCH-520: Numerical issues on FVP.",
     "5x5_3x2x128x128_st1": "MLETORCH-520: Numerical issues on FVP.",
 }
+per_channel_quant_xfails = {
+    "groups": "MLETORCH-1144: Invalid TOSA Graph",
+    "groups_bias": "MLETORCH-1144: Invalid TOSA Graph",
+}
 input_t = Tuple[torch.Tensor]
@@ -398,7 +402,7 @@ def test_convolution_2d_tosa_MI(test_module):
     pipeline.run()
 
 
-@common.parametrize("test_module", test_modules)
+@common.parametrize("test_module", test_modules, per_channel_quant_xfails)
 def test_convolution_2d_tosa_BI(test_module):
     pipeline = TosaPipelineBI[input_t](
         test_module(),
@@ -410,7 +414,7 @@ def test_convolution_2d_tosa_BI(test_module):
     pipeline.run()
 
 
-@common.parametrize("test_module", test_modules, fvp_xfails)
+@common.parametrize("test_module", test_modules, fvp_xfails | per_channel_quant_xfails)
 @common.XfailIfNoCorstone300
 def test_convolution_2d_u55_BI(test_module):
     pipeline = EthosU55PipelineBI[input_t](
@@ -423,7 +427,7 @@ def test_convolution_2d_u55_BI(test_module):
     pipeline.run()
 
 
-@common.parametrize("test_module", test_modules, fvp_xfails)
+@common.parametrize("test_module", test_modules, fvp_xfails | per_channel_quant_xfails)
 @common.XfailIfNoCorstone320
 def test_convolution_2d_u85_BI(test_module):
     pipeline = EthosU85PipelineBI[input_t](

backends/arm/test/ops/test_multihead_attention.py

Lines changed: 8 additions & 1 deletion
@@ -53,7 +53,14 @@ def test_multihead_attention_tosa_MI(test_data: input_t1):
 )
 def test_multihead_attention_tosa_BI(test_data):
     test_data, module = test_data()
-    pipeline = TosaPipelineBI(module, (*test_data, *test_data, *test_data), [], [])
+    pipeline = TosaPipelineBI(
+        module,
+        (*test_data, *test_data, *test_data),
+        [],
+        [],
+        # TODO: Per-channel quantization is broken (MLETORCH-1144)
+        per_channel_quantization=False,
+    )
     pipeline.run()

backends/arm/test/tester/test_pipeline.py

Lines changed: 25 additions & 33 deletions
@@ -299,7 +299,7 @@ def __init__(
         run_on_tosa_ref_model: bool = True,
         tosa_version: str = "TOSA-0.80+BI",
         symmetric_io_quantization: bool = False,
-        per_channel_quantization: bool = False,
+        per_channel_quantization: bool = True,
         use_to_edge_transform_and_lower: bool = True,
         custom_path: str = None,
         atol: float = 1e-03,
@@ -316,16 +316,14 @@ def __init__(
         compile_spec = common.get_tosa_compile_spec(
             tosa_profiles[tosa_version], custom_path=custom_path
         )
-        if symmetric_io_quantization or per_channel_quantization:
-            quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
-            quantization_config = get_symmetric_quantization_config(
-                is_per_channel=per_channel_quantization
-            )
-            if symmetric_io_quantization:
-                quantizer.set_io(quantization_config)
-            quant_stage = Quantize(quantizer, quantization_config)
-        else:
-            quant_stage = None
+
+        quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
+        quantization_config = get_symmetric_quantization_config(
+            is_per_channel=per_channel_quantization
+        )
+        if symmetric_io_quantization:
+            quantizer.set_io(quantization_config)
+        quant_stage = Quantize(quantizer, quantization_config)
 
         super().__init__(
             module,
@@ -474,24 +472,21 @@ def __init__(
         exir_ops: Optional[str | List[str]] = None,
         run_on_fvp: bool = True,
         symmetric_io_quantization: bool = False,
-        per_channel_quantization: bool = False,
+        per_channel_quantization: bool = True,
         use_to_edge_transform_and_lower: bool = True,
         custom_path: str = None,
         atol: float = 1e-03,
         rtol: float = 1e-03,
         qtol: int = 1,
     ):
         compile_spec = common.get_u55_compile_spec(custom_path=custom_path)
-        if symmetric_io_quantization or per_channel_quantization:
-            quantizer = EthosUQuantizer(compile_spec)
-            quantization_config = get_symmetric_quantization_config(
-                is_per_channel=per_channel_quantization
-            )
-            if symmetric_io_quantization:
-                quantizer.set_io(quantization_config)
-            quant_stage = Quantize(quantizer, quantization_config)
-        else:
-            quant_stage = None
+        quantizer = EthosUQuantizer(compile_spec)
+        quantization_config = get_symmetric_quantization_config(
+            is_per_channel=per_channel_quantization
+        )
+        if symmetric_io_quantization:
+            quantizer.set_io(quantization_config)
+        quant_stage = Quantize(quantizer, quantization_config)
 
         super().__init__(
             module,
@@ -564,24 +559,21 @@ def __init__(
         exir_ops: str | List[str] = None,
         run_on_fvp: bool = True,
         symmetric_io_quantization: bool = False,
-        per_channel_quantization: bool = False,
+        per_channel_quantization: bool = True,
         use_to_edge_transform_and_lower: bool = True,
         custom_path: str = None,
         atol: float = 1e-03,
         rtol: float = 1e-03,
         qtol: int = 1,
     ):
         compile_spec = common.get_u85_compile_spec(custom_path=custom_path)
-        if symmetric_io_quantization or per_channel_quantization:
-            quantizer = EthosUQuantizer(compile_spec)
-            quantization_config = get_symmetric_quantization_config(
-                is_per_channel=per_channel_quantization
-            )
-            if symmetric_io_quantization:
-                quantizer.set_io(quantization_config)
-            quant_stage = Quantize(quantizer, quantization_config)
-        else:
-            quant_stage = None
+        quantizer = EthosUQuantizer(compile_spec)
+        quantization_config = get_symmetric_quantization_config(
+            is_per_channel=per_channel_quantization
+        )
+        if symmetric_io_quantization:
+            quantizer.set_io(quantization_config)
+        quant_stage = Quantize(quantizer, quantization_config)
 
         super().__init__(
             module,

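In the test suites, individual cases can still opt out through the pipeline flag, as the multihead-attention test above does. A hypothetical minimal usage (`module` and `example_inputs` are placeholders, not names from this diff):

pipeline = TosaPipelineBI(
    module,          # placeholder: model under test
    example_inputs,  # placeholder: representative inputs
    [],              # aten ops to check
    [],              # exir ops to check
    per_channel_quantization=False,  # override the new default for this test
)
pipeline.run()
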
examples/arm/aot_arm_compiler.py

Lines changed: 1 addition & 2 deletions
@@ -160,8 +160,7 @@ def quantize(
     else:
         raise RuntimeError("Unsupported compilespecs for quantization!")
 
-    # if we set is_per_channel to True, we also need to add out_variant of quantize_per_channel/dequantize_per_channel
-    operator_config = get_symmetric_quantization_config(is_per_channel=False)
+    operator_config = get_symmetric_quantization_config()
     quantizer.set_global(operator_config)
     m = prepare_pt2e(model, quantizer)

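For context, the `quantize()` helper above feeds this config into the standard PT2E post-training-quantization flow. Roughly, as a sketch: `prepare_pt2e`/`convert_pt2e` are the stock torch helpers, and `example_inputs` is a hypothetical calibration batch.

from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

# Per-channel is now the default, so no argument is needed here.
operator_config = get_symmetric_quantization_config()
quantizer.set_global(operator_config)
m = prepare_pt2e(model, quantizer)  # insert observers into the exported graph
m(*example_inputs)                  # calibrate on representative data (hypothetical batch)
m = convert_pt2e(m)                 # replace observers with quantize/dequantize ops
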
examples/arm/ethos_u_minimal_example.ipynb

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@
 "\n",
 "# Create and configure quantizer to use a symmetric quantization config globally on all nodes\n",
 "quantizer = EthosUQuantizer(compile_spec)\n",
-"operator_config = get_symmetric_quantization_config(is_per_channel=False)\n",
+"operator_config = get_symmetric_quantization_config()\n",
 "quantizer.set_global(operator_config)\n",
 "\n",
 "# Post training quantization\n",

0 commit comments
