Skip to content

Commit 222f96f

Browse files
committed
Update on "Arm backend: Add 16A8W support and test for mul operation"
Add 16A8W quantization support and test for the mul operation in ExecutorTorch ARM backend. This follows the pattern established for linear operations, extending int16 support to mul operations. Changes: - Add INT16 dtype validation support in op_mul.py - Add test_mul_tensor_16a8w_tosa_INT test function - Enable test_mul.py in test targets configuration The 16A8W configuration uses 16-bit activations with 8-bit weights, enabling higher precision for activations while maintaining weight efficiency. Differential Revision: [D80510628](https://our.internmc.facebook.com/intern/diff/D80510628/) cc digantdesai freddan80 per zingo oscarandersson8218 [ghstack-poisoned]
2 parents e1268e0 + 346cd5d commit 222f96f

File tree

3 files changed

+111
-7
lines changed

3 files changed

+111
-7
lines changed

backends/arm/operators/op_mul.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class MulVisitor_INT(NodeVisitor):
3434

3535
tosa_specs = [
3636
TosaSpecification.create_from_string("TOSA-1.0+INT"),
37+
TosaSpecification.create_from_string("TOSA-1.0+INT+int16"),
3738
]
3839

3940
def define_node(
@@ -55,7 +56,7 @@ def define_node(
5556
output.tosa_spec,
5657
)
5758

58-
if inputs[0].dtype == ts.DType.INT8:
59+
if inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.INT16:
5960
input_A = inputs[0]
6061
input_B = inputs[1]
6162
input_qparams = get_input_qparams(node)
@@ -84,11 +85,11 @@ def define_node(
8485
# Non quantized input, natively supported by TOSA.MUL
8586
input_A_rescaled, input_B_rescaled = inputs[0], inputs[1]
8687

87-
if output.dtype == ts.DType.INT8:
88+
if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16:
8889
output_shape = tutils.tosa_shape(output.shape, output.dim_order)
8990
mul_output = tosa_graph.addIntermediate(output_shape, ts.DType.INT32)
9091
else:
91-
# output.dtype == ts.DType.INT16 or ts.DType.INT32
92+
# output.dtype == ts.DType.INT32 (non-quantized)
9293
mul_output = output
9394

9495
# Do the INT32 Mul
@@ -110,6 +111,15 @@ def define_node(
110111
tqutils.insert_rescale_op_to_int8(
111112
tosa_graph, mul_output, output_scale, node, self.tosa_spec
112113
)
114+
elif output.dtype == ts.DType.INT16:
115+
# Scale output back to 16 bit
116+
output_scale = (
117+
input_A_qargs.get_scale_per_tensor() # type: ignore[possibly-undefined]
118+
* input_B_qargs.get_scale_per_tensor() # type: ignore[possibly-undefined]
119+
)
120+
tqutils.insert_rescale_op_to_int16(
121+
tosa_graph, mul_output, output_scale, node, self.tosa_spec
122+
)
113123

114124

115125
@register_node_visitor

backends/arm/test/ops/test_add.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False):
262262

263263
@common.parametrize("test_data", Add.test_data)
264264
@pytest.mark.xfail(
265-
reason="missing int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13969"
265+
reason="missing int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13730"
266266
)
267267
def test_add_tensor_16a8w_tosa_INT(test_data: input_t1):
268268
"""Test add operation with 16A8W quantization (16-bit activations, 8-bit weights)"""

backends/arm/tosa/quant_utils.py

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,58 @@ def insert_rescale_op_to_int8(
140140
compute_rescale: boolean indicating whether we need to divide the output scale by the original scale.
141141
tosa_graph: the tosa_graph to manipulate.
142142
143+
This function is used in serialization to TOSA for target ops that are
144+
handled by the DQ/D folding pass, which stores the quantization parameters
145+
in the node meta dict.
146+
"""
147+
_insert_rescale_op_to_dtype(
148+
tosa_graph, last_tensor, scale, node, ts.DType.INT8, compute_rescale, tosa_spec
149+
)
150+
151+
152+
def insert_rescale_op_to_int16(
153+
tosa_graph: Any,
154+
last_tensor: TosaArg,
155+
scale: float,
156+
node: Node,
157+
compute_rescale=True,
158+
tosa_spec=None,
159+
) -> None:
160+
"""Rescales the node back to int16, adding a suitable RESCALE op to 'tosa_graph'.
161+
Parameters:
162+
node: The original node that is being handled by the rescales.
163+
last_tensor: the tosa tensor to rescale back.
164+
scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_op_to_int32'
165+
compute_rescale: boolean indicating whether we need to divide the output scale by the original scale.
166+
tosa_graph: the tosa_graph to manipulate.
167+
168+
This function is used in serialization to TOSA for target ops that are
169+
handled by the DQ/D folding pass, which stores the quantization parameters
170+
in the node meta dict.
171+
"""
172+
_insert_rescale_op_to_dtype(
173+
tosa_graph, last_tensor, scale, node, ts.DType.INT16, compute_rescale, tosa_spec
174+
)
175+
176+
177+
def _insert_rescale_op_to_dtype(
178+
tosa_graph: Any,
179+
last_tensor: TosaArg,
180+
scale: float,
181+
node: Node,
182+
output_dtype: Any,
183+
compute_rescale=True,
184+
tosa_spec=None,
185+
) -> None:
186+
"""Common implementation for rescaling nodes back to a specific dtype.
187+
Parameters:
188+
node: The original node that is being handled by the rescales.
189+
last_tensor: the tosa tensor to rescale back.
190+
scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_op_to_int32'
191+
output_dtype: The target dtype (ts.DType.INT8 or ts.DType.INT16)
192+
compute_rescale: boolean indicating whether we need to divide the output scale by the original scale.
193+
tosa_graph: the tosa_graph to manipulate.
194+
143195
This function is used in serialization to TOSA for target ops that are
144196
handled by the DQ/D folding pass, which stores the quantization parameters
145197
in the node meta dict.
@@ -158,13 +210,14 @@ def insert_rescale_op_to_int8(
158210
else:
159211
output_rescale_scale = scale
160212

161-
# Rescale Back to INT8
162-
build_rescale_from_int32(
213+
# Rescale Back to the specified dtype
214+
build_rescale_from_int32_to_dtype(
163215
tosa_graph,
164216
last_tensor,
165217
node.name,
166218
qargs_out.get_zp_per_tensor(),
167219
output_rescale_scale,
220+
output_dtype,
168221
tosa_spec=tosa_spec,
169222
)
170223

@@ -337,14 +390,55 @@ def build_rescale_from_int32(
337390
per_channel: bool = False,
338391
tosa_spec=None,
339392
) -> None:
393+
# For TOSA v1.0 multipliers, shifts, input_zp and output_zp are now inputs
394+
# to the RESCALE op see: https://www.mlplatform.org/tosa/tosa_spec.html#_rescale
395+
build_rescale_from_int32_to_dtype(
396+
tosa_fb,
397+
input_node,
398+
output_name,
399+
output_zp,
400+
rescale_scale,
401+
ts.DType.INT8,
402+
is_scale32,
403+
is_double_round,
404+
per_channel,
405+
tosa_spec,
406+
)
407+
408+
return
409+
410+
411+
def build_rescale_from_int32_to_dtype(
412+
tosa_fb: Any,
413+
input_node: TosaArg,
414+
output_name: str,
415+
output_zp: int,
416+
rescale_scale: float,
417+
output_dtype: Any,
418+
is_scale32: bool = True,
419+
is_double_round: bool = False,
420+
per_channel: bool = False,
421+
tosa_spec=None,
422+
) -> None:
423+
"""Common implementation for rescaling from INT32 to a specific dtype (INT8 or INT16).
424+
425+
Parameters:
426+
tosa_fb: The TOSA serializer
427+
input_node: Input tensor (should be INT32)
428+
output_name: Name for the output tensor
429+
output_zp: Output zero point
430+
rescale_scale: Rescaling factor
431+
output_dtype: Target dtype (ts.DType.INT8 or ts.DType.INT16)
432+
Other parameters: Standard rescale parameters
433+
"""
340434
# For TOSA v1.0 multipliers, shifts, input_zp and output_zp are now inputs
341435
# to the RESCALE op see: https://www.mlplatform.org/tosa/tosa_spec.html#_rescale
342436
build_rescale(
343437
tosa_fb,
344438
[rescale_scale],
345439
input_node,
346440
output_name=output_name,
347-
output_type=ts.DType.INT8,
441+
output_type=output_dtype,
348442
input_zp=[0],
349443
output_zp=[output_zp],
350444
rounding_mode=RoundingMode.SINGLE_ROUND,

0 commit comments

Comments
 (0)