
Commit acd368f

Update on "Arm backend: Add 16A8W support and test for add operation"
Add 16A8W quantization support and comprehensive tests for the add operation in ExecutorTorch ARM backend targeting Ethos U55 and U85 NPUs. This follows the pattern established for linear operations, extending int16 support to add operations with hardware-specific testing. Changes: - Add INT16 dtype validation support in op_add.py - Add test_add_tensor_16a8w_tosa_INT test function with U55/U85 pipeline support - Add U55 and U85 specific 16A8W tests with proper xfail decorators - Fix U55/U85 test parameter usage (remove unsupported tosa_extensions, clean quantizer function calls) - Update xfail reasons to consistent 'Vela compilation fails with Invalid arguments' pattern Differential Revision: [D80510463](https://our.internmc.facebook.com/intern/diff/D80510463) cc digantdesai freddan80 per zingo oscarandersson8218 [ghstack-poisoned]
2 parents 70eb26f + 04c6534 commit acd368f
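
For intuition on why the 16A8W scheme described above helps accuracy while keeping weight memory flat, the back-of-the-envelope comparison below (standalone Python, not part of this commit) contrasts the activation step size of 8-bit and 16-bit symmetric quantization over the same calibrated range; the range value is an arbitrary example.

```python
# Standalone illustration (not from this commit): activation resolution of
# int8 vs. int16 symmetric quantization over the same calibrated range.
import torch


def symmetric_step(max_abs: float, dtype: torch.dtype) -> float:
    """Quantization step size for full-range symmetric per-tensor quantization."""
    return max_abs / torch.iinfo(dtype).max


max_abs_activation = 6.0  # example max |activation| seen during calibration
step_int8 = symmetric_step(max_abs_activation, torch.int8)    # 6 / 127   ~ 4.7e-2
step_int16 = symmetric_step(max_abs_activation, torch.int16)  # 6 / 32767 ~ 1.8e-4

# int16 activations give ~258x finer resolution, while weights stay 8-bit, so
# the parameter footprint is unchanged relative to 8A8W.
print(f"{step_int8:.6f} vs {step_int16:.6f} ({step_int8 / step_int16:.0f}x finer)")
```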

File tree

2 files changed: +83 -3 lines


backends/arm/operators/op_add.py

Lines changed: 3 additions & 3 deletions
@@ -65,15 +65,15 @@ def define_node(
                 tosa_graph, inputs, node, self.tosa_spec
             )
         else:
-            # input[0].dtype == ts.DType.INT32
+            # input[0].dtype == ts.DType.INT16 or ts.DType.INT32
             # Non quantized input, natively support by TOSA.ADD
             rescaled_inputs = inputs

         if output.dtype == ts.DType.INT8:
             broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
             add_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
         else:
-            # output.dtype == ts.DType.INT32
+            # output.dtype == ts.DType.INT16 or ts.DType.INT32
             add_output = output

         input1, input2 = rescaled_inputs
@@ -123,7 +123,7 @@ def define_node(
         validate_num_inputs(self.target, inputs, 2)
         validate_same_dtype(self.target, [*inputs, output], ts)

-        if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
+        if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32]:
            # Call the inherited define_node for handling integers
            super().define_node(node, tosa_graph, inputs, output)
         else:
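
To make the routing above easier to read at a glance, here is a standalone restatement of the two dtype checks, with plain enums standing in for ts.DType and no TOSA serializer involved; it mirrors the diff's logic but is not the backend's actual code.

```python
# Illustration only: the dtype routing that the diff above extends, restated
# without the TOSA serializer. DType stands in for ts.DType.
from enum import Enum, auto


class DType(Enum):
    INT8 = auto()
    INT16 = auto()
    INT32 = auto()
    FP32 = auto()


def add_lowering_plan(input_dtype: DType, output_dtype: DType) -> dict:
    """Summarize how an ADD with these dtypes is lowered after this change."""
    if input_dtype not in (DType.INT8, DType.INT16, DType.INT32):
        return {"path": "float ADD"}
    return {
        "path": "integer ADD",
        # Only INT8 inputs are rescaled to INT32; INT16/INT32 feed ADD natively.
        "rescale_inputs_to_int32": input_dtype == DType.INT8,
        # Only INT8 outputs go through an INT32 intermediate; INT16/INT32
        # outputs are written by ADD directly.
        "int32_intermediate_output": output_dtype == DType.INT8,
    }


# New with this commit: INT16 is accepted on the integer path.
print(add_lowering_plan(DType.INT16, DType.INT16))
```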

backends/arm/quantizer/arm_quantizer.py

Lines changed: 80 additions & 0 deletions
@@ -225,6 +225,86 @@ def get_symmetric_a16w8_quantization_config(
     return quantization_config


+@functools.lru_cache
+def get_symmetric_a16w8_quantization_config(
+    is_per_channel: bool = True,
+    is_qat: bool = False,
+    is_dynamic: bool = False,
+    weight_qmin: int = -127,
+    weight_qmax: int = 127,
+):
+    """
+    16A8W quantization config: 16-bit activations, 8-bit weights.
+
+    This configuration provides better accuracy than 8A8W while maintaining
+    reasonable memory usage through 8-bit weights.
+
+    Args:
+        is_per_channel: Whether to use per-channel quantization for weights
+        is_qat: Whether this is for Quantization Aware Training
+        is_dynamic: Whether to use dynamic quantization
+        weight_qmin: Minimum quantization value for weights
+        weight_qmax: Maximum quantization value for weights
+
+    Returns:
+        QuantizationConfig with 16-bit activations and 8-bit weights
+    """
+    extra_args: Dict[str, Any] = {"eps": 2**-12}
+
+    # Setup observer/fake-quant for 16-bit activations
+    if is_qat:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = FakeQuantize
+            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
+                averaging_constant=1
+            )
+            extra_args["observer"] = dynamic_quant_observer
+        else:
+            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
+    else:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
+        else:
+            # HistogramObserver works well for 16-bit range
+            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]
+
+    # 16-bit activation quantization spec
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int16,
+        quant_min=torch.iinfo(torch.int16).min,  # -32768
+        quant_max=torch.iinfo(torch.int16).max,  # 32767
+        qscheme=torch.per_tensor_symmetric,
+        is_dynamic=is_dynamic,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args,
+        ),
+    )
+
+    # Instead of reconstructing quantization_config, just clone and update as needed
+    # Clone the quantization_config from get_symmetric_quantization_config and update activation spec
+    base_config = get_symmetric_quantization_config(
+        is_per_channel=is_per_channel,
+        is_qat=is_qat,
+        is_dynamic=is_dynamic,
+    )
+    # Replace activation quantization spec with 16-bit version
+    if is_dynamic:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,  # 16-bit input activations
+            None,
+            base_config.weight,  # 8-bit weights from base config
+            None,
+        )
+    else:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,  # 16-bit input activations
+            act_quantization_spec,  # 16-bit output activations
+            base_config.weight,  # 8-bit weights from base config
+            None,
+        )
+    return quantization_config
+
+
 NodeFilterType = Callable[[Node], bool]
 """Type for a Node Filter used by annotators. A Node filter is a function that takes
 a Node and returns whether the node should be annotated or not.
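
For orientation, here is a minimal sketch of how the new config could be wired into the standard PT2E prepare/convert flow. The quantizer class (`TOSAQuantizer`), its constructor argument, the `TosaSpecification` helper, and `set_global` are assumed from the surrounding Arm quantizer code rather than confirmed by this diff, so treat the snippet as illustrative; the tests added in this stack are the authoritative usage.

```python
# Hedged usage sketch. Assumptions (not verified against this commit):
# TOSAQuantizer, TosaSpecification, and set_global exist as named below;
# only get_symmetric_a16w8_quantization_config is confirmed by this diff.
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.arm.quantizer.arm_quantizer import (
    TOSAQuantizer,
    get_symmetric_a16w8_quantization_config,
)
from executorch.backends.arm.tosa_specification import TosaSpecification


class Add(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return x + y


example_inputs = (torch.randn(1, 8), torch.randn(1, 8))
# Exact export entry point may vary by PyTorch version.
graph_module = torch.export.export(Add(), example_inputs).module()

quantizer = TOSAQuantizer(TosaSpecification.create_from_string("TOSA-1.0+INT"))
quantizer.set_global(get_symmetric_a16w8_quantization_config(is_per_channel=True))

prepared = prepare_pt2e(graph_module, quantizer)
prepared(*example_inputs)           # calibrate with representative inputs
quantized = convert_pt2e(prepared)  # int16 activation / int8 weight q-dq pairs
```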
