# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
1817
1918from executorch .exir .pass_base import ExportPass , PassResult
2019from torch .fx import GraphModule
20+
2121from torch .library import impl , Library
2222
# Library holding the custom "tosa" namespace ops used by this pass.
# NOTE(review): the "_table" op schema is presumably registered with
# lib.define(...) in lines elided from this view — confirm against the file.
lib = Library("tosa", "DEF")

@impl(lib, "_table")
def _table_impl(*args, **kwargs):  # pyre-ignore
    """Reference implementation of the tosa::_table op used while tracing.

    Only the dtype contract is modelled here: an int8 input passes through
    unchanged, while any wider input is promoted to int32 (matching the
    32-bit output of a wide TOSA table lookup).
    """
    tensor = args[0]
    if tensor.dtype == torch.int8:
        return tensor
    return tensor.to(dtype=torch.int32)
3033
3134
3235class InsertTableOpsPass (ExportPass ):
@@ -59,29 +62,89 @@ def register_buffer(self, buffer_name: str, buffer: torch.Tensor) -> None:
5962 """
6063 self .exported_program .state_dict [buffer_name ] = buffer
6164
def generate_8bit_table_values(
    self,
    torch_op: Callable[[torch.Tensor], torch.Tensor],
    in_quantargs: QuantArgs,
    out_quantargs: QuantArgs,
) -> tuple[torch.Tensor, int]:
    """Compute LUT values for an INT8 TOSA.TABLE.

    The INT8 table is a plain 256-entry 1:1 LUT, so no shifting is needed
    after the lookup: the second element of the returned tuple is always 0.
    """
    # int64 keeps the subtraction of the zero point during dequantization
    # from wrapping around,
    # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8).
    quantized_range = torch.linspace(
        start=in_quantargs.qmin,
        end=in_quantargs.qmax,
        steps=256,
        dtype=torch.int64,
    )

    def lut(x: torch.Tensor) -> torch.Tensor:
        # dequantize -> apply the float op -> requantize to the output params.
        return out_quantargs.quantize_value(
            torch_op(in_quantargs.dequantize_value(x))
        )

    return lut(quantized_range).to(dtype=torch.int8), 0
93+
def generate_16_bit_table_values(
    self,
    torch_op: Callable[[torch.Tensor], torch.Tensor],
    in_quantargs: QuantArgs,
    out_quantargs: QuantArgs,
) -> tuple[torch.Tensor, int]:
    """Compute LUT values for an INT16 TOSA.TABLE with 32-bit output (in
    practice 23 bits, see the TOSA specification).

    The table output carries 7 fractional bits, i.e. it is interpreted as
    128x too large unless compensated for. The values are right-shifted to
    fit in 16 bits, and the returned left shift is (right shift - 7) to
    account for those fractional bits.
    """

    def lut(x: torch.Tensor) -> torch.Tensor:
        # Drop the 7 LSBs (interpolation bits) before dequantizing.
        masked = x & ~0x7F
        return out_quantargs.quantize_value(
            torch_op(in_quantargs.dequantize_value(masked))
        )

    # 513 entries spanning [qmin, qmax + 1]; int64 keeps the zero-point
    # subtraction during dequantization from wrapping around,
    # e.g. torch.tensor(-50, dtype=torch.int8) - 100 == torch.tensor(106, dtype=torch.int8).
    lut_values = lut(
        torch.linspace(
            start=in_quantargs.qmin,
            end=in_quantargs.qmax + 1,
            steps=513,
            dtype=torch.int64,
        )
    )
    # How far to shift so the table values fit in 16 bits:
    # ceil(log2(max absolute table value)) + 1 bit for signedness - 16.
    # Note: for out_quantargs.dtype=torch.int16, rshift == 0.
    rshift = int(torch.ceil(torch.log2(lut_values.abs().max()))) + 1 - 16
    lut_values = lut_values >> rshift
    return lut_values.to(dtype=torch.int16), rshift - 7
128+
def generate_table_values(
    self,
    torch_op: Callable[[torch.Tensor], torch.Tensor],
    in_quantargs: QuantArgs,
    out_quantargs: QuantArgs,
) -> tuple[torch.Tensor, int]:
    """Build the LUT for ``torch_op``, dispatched on the output dtype.

    Returns the table values together with how much the table output must
    be left-shifted afterwards to be correctly scaled.

    Raises:
        ValueError: if no table generator exists for the output dtype.
    """
    out_dtype = out_quantargs.dtype
    if out_dtype == torch.int8:
        return self.generate_8bit_table_values(
            torch_op, in_quantargs, out_quantargs
        )
    if out_dtype in (torch.int16, torch.int32):
        return self.generate_16_bit_table_values(
            torch_op, in_quantargs, out_quantargs
        )
    raise ValueError(f"Unsupported output dtype for table: {out_dtype}")
85148
86149 def call (self , graph_module : GraphModule ) -> PassResult :
87150 modified = False
@@ -100,10 +163,12 @@ def call(self, graph_module: GraphModule) -> PassResult:
100163 op_target = torch .ops .tosa ._table .default ,
101164 args = (node .args [0 ],),
102165 )
166+ output_node = table_node
103167 assert len (input_qparams ) == 1
104168 assert len (output_qparams ) == 1
105- # Generate table buffer
106- buffer = self .generate_table_values (
169+
170+ # Generate table buffer and how much to lshift the table output.
171+ buffer , lshift = self .generate_table_values (
107172 torch_op = self .table_ops [node .target ],
108173 in_quantargs = input_qparams [0 ],
109174 out_quantargs = output_qparams [0 ],
@@ -114,10 +179,20 @@ def call(self, graph_module: GraphModule) -> PassResult:
114179 self .register_buffer (
115180 buffer_name = table_node .name .replace ("_default" , "" ), buffer = buffer
116181 )
117- node .replace_all_uses_with (table_node )
182+
183+ if lshift != 0 :
184+ scale = 2.0 ** lshift
185+ rescale_node = create_node (
186+ graph = graph_module .graph ,
187+ op_target = torch .ops .tosa ._rescale .default ,
188+ args = (table_node , output_qparams [0 ].dtype , scale , 0 , 0 ),
189+ )
190+ output_node = rescale_node
191+
192+ node .replace_all_uses_with (output_node )
118193 graph_module .graph .erase_node (node )
119- table_node .meta ["input_qparams" ] = input_qparams
120- table_node .meta ["output_qparams" ] = output_qparams
194+ output_node .meta ["input_qparams" ] = input_qparams
195+ output_node .meta ["output_qparams" ] = output_qparams
121196 modified = True
122197
123198 if modified :
0 commit comments