
Commit 07879f1

Ninja91 authored and facebook-github-bot committed
Add 16A8W quantization configuration utility for ARM backend (#13175)
Summary: Pull Request resolved: #13175

This diff implements a 16A8W (16-bit activations, 8-bit weights) quantization configuration utility for the ExecuTorch ARM backend, following the feedback from D79746479.

## Key Changes

**1. New Quantization Configuration Function**
- Add `get_16a8w_quantization_config()` in `fbcode/executorch/backends/arm/quantizer/arm_quantizer.py`.
- Provides 16-bit activations with `HistogramObserver` (better precision than 8A8W).
- Keeps weights at 8 bits with `MinMaxObserver`/`PerChannelMinMaxObserver` (memory efficient).
- **Technically supported by TOSA through the [EXT-INT16 extension/profile](https://www.mlplatform.org/tosa/tosa_spec.html#_conv2d).**

**2. Test Implementation**
- Add `test_linear_16a8w_tosa_INT()` in `fbcode/executorch/backends/arm/test/ops/test_linear.py`.
- Demonstrates usage of the new 16A8W quantization configuration.

## Benefits
- **Better precision**: 16-bit activations provide higher precision than 8-bit, which is useful for carrying precision through recurrent neural networks.
- **Configurable**: Supports the same parameters as the existing quantization configurations.

## Testing
The implementation provides the utility function and test infrastructure. Note that the test reveals the TOSA backend has limited INT16 support for some operations (view operations only support INT8/INT32/FP32/BOOL); this is expected and shows the configuration correctly produces INT16 tensors.

Differential Revision: D79763381
1 parent 016eece commit 07879f1
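
For orientation, here is a minimal sketch of how the new configuration would typically be attached to a `TOSAQuantizer`, using only the APIs that appear in this diff (`TOSAQuantizer`, `set_io`, `get_16a8w_quantization_config`); the `TosaSpecification` import path is an assumption, and the snippet is illustrative rather than part of the change.

```python
# Illustrative usage sketch (not part of this commit): wire the new 16A8W
# config into a TOSAQuantizer the same way the test pipeline below does.
from executorch.backends.arm.quantizer.arm_quantizer import (
    TOSAQuantizer,
    get_16a8w_quantization_config,
)

# Import path assumed; TosaSpecification is only referenced, not imported,
# in the hunks shown below.
from executorch.backends.arm.tosa_specification import TosaSpecification

tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT")
quantizer = TOSAQuantizer(tosa_spec)

# 16-bit activations, 8-bit per-channel weights.
config = get_16a8w_quantization_config(is_per_channel=True)
quantizer.set_io(config)
```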

File tree

3 files changed: +144 −6 lines changed


backends/arm/quantizer/arm_quantizer.py

Lines changed: 105 additions & 1 deletion
The first hunk (`@@ -144,6 +144,111 @@`) adds the new function directly after `get_symmetric_quantization_config()`:

```python
@functools.lru_cache
def get_16a8w_quantization_config(
    is_per_channel: bool = True,
    is_qat: bool = False,
    is_dynamic: bool = False,
    weight_qmin: int = -127,
    weight_qmax: int = 127,
):
    """
    16A8W quantization config: 16-bit activations, 8-bit weights.

    This configuration provides better accuracy than 8A8W while maintaining
    reasonable memory usage through 8-bit weights.

    Args:
        is_per_channel: Whether to use per-channel quantization for weights
        is_qat: Whether this is for Quantization Aware Training
        is_dynamic: Whether to use dynamic quantization
        weight_qmin: Minimum quantization value for weights
        weight_qmax: Maximum quantization value for weights

    Returns:
        QuantizationConfig with 16-bit activations and 8-bit weights
    """
    extra_args: Dict[str, Any] = {"eps": 2**-12}

    # Setup observer/fake-quant for 16-bit activations
    if is_qat:
        if is_dynamic:
            act_observer_or_fake_quant_ctr = FakeQuantize
            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
                averaging_constant=1
            )
            extra_args["observer"] = dynamic_quant_observer
        else:
            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
    else:
        if is_dynamic:
            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
        else:
            # HistogramObserver works well for 16-bit range
            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]

    # 16-bit activation quantization spec
    act_quantization_spec = QuantizationSpec(
        dtype=torch.int32,
        quant_min=torch.iinfo(torch.int16).min,  # -32768
        quant_max=torch.iinfo(torch.int16).max,  # 32767
        qscheme=torch.per_tensor_affine,
        is_dynamic=is_dynamic,
        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
            **extra_args,
        ),
    )

    # Setup quantization config for weights (same as 8A8W - use 8-bit weights)
    weight_qscheme = (
        torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric
    )
    weight_observer_or_fake_quant_ctr: ObserverOrFakeQuantizeConstructor = (
        MinMaxObserver
    )
    # Determine the right observer/fake-quant constructor
    if is_qat:
        # Set plain fake-quant with true min/max
        weight_observer_or_fake_quant_ctr = FakeQuantize
    else:
        # PTQ: set min/max observer
        weight_observer_or_fake_quant_ctr = (
            PerChannelMinMaxObserver if is_per_channel else MinMaxObserver
        )

    weight_extra_args = {"eps": 2**-12}

    # 8-bit weight quantization spec (keep weights at 8-bit for memory efficiency)
    weight_quantization_spec = QuantizationSpec(
        dtype=torch.int8,
        quant_min=weight_qmin,
        quant_max=weight_qmax,
        qscheme=weight_qscheme,
        ch_axis=0,
        is_dynamic=False,
        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
            **weight_extra_args
        ),
    )

    bias_quantization_spec = None
    if is_dynamic:
        quantization_config = QuantizationConfig(
            act_quantization_spec,  # 16-bit input activations
            None,
            weight_quantization_spec,  # 8-bit weights
            bias_quantization_spec,
        )
    else:
        quantization_config = QuantizationConfig(
            act_quantization_spec,  # 16-bit input activations
            act_quantization_spec,  # 16-bit output activations
            weight_quantization_spec,  # 8-bit weights
            bias_quantization_spec,
        )
    return quantization_config
```

The second hunk (`@@ -217,7 +322,6 @@`) only removes a stray blank line between `class TOSAQuantizer(Quantizer):` and its `__init__` signature.
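
As a quick sanity check on what the new function returns: the activation spec uses an `int32` container dtype but clamps values to the `int16` range, which is what the EXT-INT16 path expects. The sketch below illustrates this; the `input_activation` attribute name is an assumption about the `QuantizationConfig` dataclass and does not appear in this diff.

```python
# Hypothetical inspection of the returned config (attribute name assumed).
import torch

from executorch.backends.arm.quantizer.arm_quantizer import (
    get_16a8w_quantization_config,
)

cfg = get_16a8w_quantization_config()
act_spec = cfg.input_activation  # assumed field name on QuantizationConfig

assert act_spec.dtype == torch.int32       # int32 container dtype
assert act_spec.quant_min == -(2**15)      # -32768, int16 lower bound
assert act_spec.quant_max == 2**15 - 1     # 32767, int16 upper bound
```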

backends/arm/test/ops/test_linear.py

Lines changed: 33 additions & 0 deletions
The first hunk (`@@ -11,6 +11,9 @@`) adds one import next to the existing ones:

```python
from executorch.backends.arm.quantizer.arm_quantizer import (
    get_16a8w_quantization_config,
)
```

The second hunk (`@@ -258,3 +261,33 @@`) appends the new test after `test_linear_vgf_INT()`:

```python
@pytest.mark.xfail(
    reason="TOSA backend has limited INT16 support - view operations only support INT8/INT32/FP32/BOOL"
)
@common.parametrize("test_data", test_data_rank1_INT)
def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
    """Test linear operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
    test_data, out_features, has_bias, per_channel_quantization = test_data()
    in_features = test_data.shape[-1]

    # Create pipeline with custom 16A8W quantization config
    pipeline = TosaPipelineINT[input_t1](
        Linear(
            in_features=in_features,
            out_features=out_features,
            bias=has_bias,
        ),
        (test_data,),
        aten_op,
        exir_op=[],
        per_channel_quantization=per_channel_quantization,
        use_to_edge_transform_and_lower=True,
        quantization_config=get_16a8w_quantization_config(
            is_per_channel=per_channel_quantization
        ),
    )

    # Run the pipeline
    pipeline.run()
```

backends/arm/test/tester/test_pipeline.py

Lines changed: 6 additions & 5 deletions
```diff
@@ -107,7 +107,6 @@ def __init__(
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
         ] = None,
     ):
-
         self.tester = ArmTester(
             module,
             example_inputs=test_data,
@@ -306,6 +305,7 @@ def __init__(
         rtol: float = 1e-03,
         qtol: int = 1,
         dynamic_shapes: Optional[Tuple[Any]] = None,
+        quantization_config: Optional[Any] = None,
     ):
         tosa_profiles = {
             "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT"),
@@ -317,9 +317,11 @@ def __init__(
         )
 
         quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
-        quantization_config = get_symmetric_quantization_config(
-            is_per_channel=per_channel_quantization
-        )
+        # Use custom quantization config if provided, otherwise use default
+        if quantization_config is None:
+            quantization_config = get_symmetric_quantization_config(
+                is_per_channel=per_channel_quantization
+            )
         if symmetric_io_quantization:
             quantizer.set_io(quantization_config)
         quant_stage = Quantize(quantizer, quantization_config)
@@ -856,7 +858,6 @@ def __init__(
             Union[Sequence[PassType], Dict[str, Sequence[PassType]]]
         ] = None,
     ):
-
         tosa_profile = TosaSpecification.create_from_string(tosa_version)
         compile_spec = common.get_vgf_compile_spec(
             tosa_profile, compiler_flags=vgf_compiler_flags, custom_path=custom_path
```
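
The design intent of the `TosaPipelineINT` change is backward compatibility: when no `quantization_config` is supplied, the pipeline falls back to the existing symmetric 8A8W default, so current callers behave exactly as before. A standalone sketch of that fallback (the helper name is hypothetical and the arguments are trimmed to the two that matter):

```python
from typing import Any, Optional

from executorch.backends.arm.quantizer.arm_quantizer import (
    get_symmetric_quantization_config,
)


def resolve_quantization_config(
    quantization_config: Optional[Any] = None,
    per_channel_quantization: bool = True,
) -> Any:
    """Mirror of the fallback added to TosaPipelineINT (hypothetical helper)."""
    if quantization_config is None:
        # No caller-supplied config: keep the existing 8A8W symmetric default.
        quantization_config = get_symmetric_quantization_config(
            is_per_channel=per_channel_quantization
        )
    return quantization_config
```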
