Commit 0601b7f
Arm backend: Enable int16x8 quantization on aot_arm_compiler (#15811)

### Summary
Adds an int16x8 target to aot_arm_compiler to enable accuracy testing in backends/arm/util/arm_model_evaluator.py.
- Renames int8 -> quant in model variable names.

cc @freddan80 @per @zingo @oscarandersson8218 @digantdesai

Signed-off-by: Saoirse Stewart <[email protected]>
1 parent da6306f commit 0601b7f
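
For orientation, here is a minimal sketch (not part of the commit) of how the new target string is intended to drive quantization config selection. It mirrors the `quantize()` change in `examples/arm/aot_arm_compiler.py` further down; the `compile_specs` argument is assumed to expose `tosa_spec.support_extension()` exactly as used in that diff.

```python
# Minimal sketch, not part of the commit: pick the operator config for the new
# "TOSA-1.0+INT+int16" target, falling back to the default int8x8 config.
from executorch.backends.arm.quantizer import (
    get_symmetric_a16w8_quantization_config,
    get_symmetric_quantization_config,
)


def select_operator_config(compile_specs, target: str):
    """Return the int16x8 config for the int16 target, int8x8 otherwise."""
    is_int16x8 = target == "TOSA-1.0+INT+int16"
    if is_int16x8:
        if not compile_specs.tosa_spec.support_extension("int16"):
            raise ValueError(
                f"TOSA spec {compile_specs.tosa_spec} doesn't support int16"
            )
        # 16-bit activations with 8-bit per-channel weights
        return get_symmetric_a16w8_quantization_config(is_per_channel=True)
    # Default: int8 activations / int8 weights, per-channel
    return get_symmetric_quantization_config(is_per_channel=True)
```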

File tree (2 files changed, +61 −41 lines):
- backends/arm/util/arm_model_evaluator.py
- examples/arm/aot_arm_compiler.py

backends/arm/util/arm_model_evaluator.py

Lines changed: 25 additions & 25 deletions
@@ -167,14 +167,14 @@ def __init__(
         self,
         model_name: str,
         fp32_model: torch.nn.Module,
-        int8_model: torch.nn.Module,
+        quant_model: torch.nn.Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: Optional[str],
     ) -> None:
         self.model_name = model_name

         self.fp32_model = fp32_model
-        self.int8_model = int8_model
+        self.quant_model = quant_model
         self.example_input = example_input

         if tosa_output_path:
@@ -192,12 +192,12 @@ def get_model_error(self) -> defaultdict:
             mean_absolute_error
         """
         fp32_outputs, _ = tree_flatten(self.fp32_model(*self.example_input))
-        int8_outputs, _ = tree_flatten(self.int8_model(*self.example_input))
+        quant_outputs, _ = tree_flatten(self.quant_model(*self.example_input))

         model_error_dict = defaultdict(list)

-        for fp32_output, int8_output in zip(fp32_outputs, int8_outputs):
-            difference = fp32_output - int8_output
+        for fp32_output, quant_output in zip(fp32_outputs, quant_outputs):
+            difference = fp32_output - quant_output
             # Avoid divide by zero: elements where fp32 == 0 produce 0% contribution
             percentage_error = torch.where(
                 fp32_output != 0,
@@ -252,14 +252,14 @@ def __init__(
         self,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         batch_size: int,
         validation_dataset_path: str,
     ) -> None:
         super().__init__(
-            model_name, fp32_model, int8_model, example_input, tosa_output_path
+            model_name, fp32_model, quant_model, example_input, tosa_output_path
         )

         self.__batch_size = batch_size
@@ -279,7 +279,7 @@ def from_config(
         cls,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         config: dict[str, Any],
@@ -291,7 +291,7 @@ def from_config(
         return cls(
             model_name,
             fp32_model,
-            int8_model,
+            quant_model,
             example_input,
             tosa_output_path,
             batch_size=config["batch_size"],
@@ -302,10 +302,9 @@ def evaluate(self) -> dict[str, Any]:
         # Load dataset and compute top-1 / top-5
         dataset = MobileNetV2Evaluator.__load_dataset(self.__validation_set_path)
         top1_correct, top5_correct = GenericModelEvaluator.evaluate_topk(
-            self.int8_model, dataset, self.__batch_size, topk=5
+            self.quant_model, dataset, self.__batch_size, topk=5
         )
         output = super().evaluate()
-
         output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct}
         return output

@@ -317,14 +316,14 @@ def __init__(
         self,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         batch_size: int,
         validation_dataset_path: str,
     ) -> None:
         super().__init__(
-            model_name, fp32_model, int8_model, example_input, tosa_output_path
+            model_name, fp32_model, quant_model, example_input, tosa_output_path
         )
         self.__batch_size = batch_size
         self.__validation_set_path = validation_dataset_path
@@ -343,7 +342,7 @@ def from_config(
         cls,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         config: dict[str, Any],
@@ -355,7 +354,7 @@ def from_config(
         return cls(
             model_name,
             fp32_model,
-            int8_model,
+            quant_model,
             example_input,
             tosa_output_path,
             batch_size=config["batch_size"],
@@ -366,7 +365,7 @@ def evaluate(self) -> dict[str, Any]:
         # Load dataset and compute top-1 / top-5
         dataset = DeiTTinyEvaluator.__load_dataset(self.__validation_set_path)
         top1, top5 = GenericModelEvaluator.evaluate_topk(
-            self.int8_model, dataset, self.__batch_size, topk=5
+            self.quant_model, dataset, self.__batch_size, topk=5
         )
         output = super().evaluate()
         output["metrics"]["accuracy"] = {"top-1": top1, "top-5": top5}
@@ -380,14 +379,14 @@ def __init__(
         self,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         batch_size: int,
         validation_dataset_path: str,
     ) -> None:
         super().__init__(
-            model_name, fp32_model, int8_model, example_input, tosa_output_path
+            model_name, fp32_model, quant_model, example_input, tosa_output_path
         )
         self.__batch_size = batch_size
         self.__validation_set_path = validation_dataset_path
@@ -406,15 +405,15 @@ def from_config(
         cls,
         model_name: str,
         fp32_model: Module,
-        int8_model: Module,
+        quant_model: Module,
         example_input: Tuple[torch.Tensor],
         tosa_output_path: str | None,
         config: dict[str, Any],
     ) -> "ResNet18Evaluator":
         return cls(
             model_name,
             fp32_model,
-            int8_model,
+            quant_model,
             example_input,
             tosa_output_path,
             batch_size=config["batch_size"],
@@ -424,7 +423,7 @@ def from_config(
     def evaluate(self) -> dict[str, Any]:
         dataset = ResNet18Evaluator.__load_dataset(self.__validation_set_path)
         top1, top5 = GenericModelEvaluator.evaluate_topk(
-            self.int8_model, dataset, self.__batch_size, topk=5
+            self.quant_model, dataset, self.__batch_size, topk=5
         )
         output = super().evaluate()
         output["metrics"]["accuracy"] = {"top-1": top1, "top-5": top5}
@@ -463,8 +462,9 @@ def evaluator_calibration_data(
 def evaluate_model(
     model_name: str,
     intermediates: str,
+    target: str,
     model_fp32: torch.nn.Module,
-    model_int8: torch.nn.Module,
+    model_quant: torch.nn.Module,
     example_inputs: Tuple[torch.Tensor],
     evaluator_name: str,
     evaluator_config: str | None,
@@ -486,7 +486,7 @@ def evaluate_model(
         init_evaluator = factory(
             model_name,
             model_fp32,
-            model_int8,
+            model_quant,
             example_inputs,
             str(tosa_paths[0]),
             config,
@@ -497,11 +497,11 @@ def evaluate_model(
         )
     else:
         init_evaluator = evaluator(
-            model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0])
+            model_name, model_fp32, model_quant, example_inputs, str(tosa_paths[0])
        )

     quant_metrics = init_evaluator.evaluate()
-    output_json_path = intermediates_path / "quant_metrics.json"
+    output_json_path = intermediates_path / f"{target}-quant_metrics.json"

     with output_json_path.open("w") as json_file:
         json.dump(quant_metrics, json_file)
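
With `target` threaded through `evaluate_model`, metrics from different targets now land in separate files instead of all overwriting `quant_metrics.json`. A small illustration (the intermediates directory name here is an assumption):

```python
# Illustration only: how the per-target metrics filename is built.
from pathlib import Path

intermediates_path = Path("intermediates")  # assumed output directory
target = "TOSA-1.0+INT+int16"               # the new target string
output_json_path = intermediates_path / f"{target}-quant_metrics.json"
print(output_json_path)  # intermediates/TOSA-1.0+INT+int16-quant_metrics.json
```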

examples/arm/aot_arm_compiler.py

Lines changed: 36 additions & 16 deletions
@@ -19,7 +19,10 @@
 from examples.devtools.scripts.export_bundled_program import save_bundled_program
 from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
 from executorch.backends.arm.ethosu import EthosUCompileSpec
-from executorch.backends.arm.quantizer import get_symmetric_quantization_config
+from executorch.backends.arm.quantizer import (
+    get_symmetric_a16w8_quantization_config,
+    get_symmetric_quantization_config,
+)
 from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
 from executorch.backends.arm.util._factory import create_partitioner, create_quantizer
@@ -228,6 +231,7 @@ def quantize(
     example_inputs: Tuple[torch.Tensor],
     evaluator_name: str | None,
     evaluator_config: Dict[str, Any] | None,
+    is_int16x8: bool = False,
 ) -> GraphModule:
     """This is the official recommended flow for quantization in pytorch 2.0
     export.
@@ -238,7 +242,18 @@

     quantizer = create_quantizer(compile_specs)

-    operator_config = get_symmetric_quantization_config()
+    if is_int16x8:
+        if compile_specs.tosa_spec.support_extension("int16"):
+            operator_config = get_symmetric_a16w8_quantization_config(
+                is_per_channel=True
+            )
+        else:
+            raise ValueError(
+                f"Context TOSA spec {compile_specs.tosa_spec} doesn't support int16"
+            )
+    else:
+        operator_config = get_symmetric_quantization_config(is_per_channel=True)
+
     quantizer.set_global(operator_config)
     m = prepare_pt2e(model, quantizer)

@@ -356,6 +371,7 @@ def forward(self, x):
     "vgf",
     "TOSA-1.0+INT",
     "TOSA-1.0+FP",
+    "TOSA-1.0+INT+int16",
 ]


@@ -681,20 +697,23 @@ def quantize_model(
     example_inputs: Tuple[torch.Tensor],
     compile_spec,
 ) -> Tuple[GraphModule, ExportedProgram]:
-    model_int8 = quantize(
+
+    is_int16x8 = True if args.target == "TOSA-1.0+INT+int16" else False
+    model_quant = quantize(
         model,
         args.model_name,
         compile_spec,
         example_inputs,
         args.evaluate,
         args.evaluate_config,
+        is_int16x8,
     )
     # Wrap quantized model back into an exported_program
     exported_program = torch.export.export(
-        model_int8, example_inputs, strict=args.strict_export
+        model_quant, example_inputs, strict=args.strict_export
     )

-    return model_int8, exported_program
+    return model_quant, exported_program


 def to_edge_TOSA_delegate(
@@ -715,9 +734,9 @@ def to_edge_TOSA_delegate(
         args.enable_debug_mode,
     )

-    model_int8 = None
+    model_quant = None
     if args.quantize:
-        model_int8, exported_program = quantize_model(
+        model_quant, exported_program = quantize_model(
             args, model, example_inputs, compile_spec
         )

@@ -731,7 +750,7 @@ def to_edge_TOSA_delegate(
         ),
     )

-    return model_int8, edge
+    return model_quant, edge


 def to_edge_no_delegate(
@@ -740,7 +759,7 @@ def to_edge_no_delegate(
     model: GraphModule,
     example_inputs: Tuple[torch.Tensor],
 ):
-    model_int8 = None
+    model_quant = None
     if args.quantize:
         # As we can target multiple output encodings, one must
         # be specified.
@@ -756,7 +775,7 @@ def to_edge_no_delegate(
         model, exported_program = quantize_model(
             args, model, example_inputs, compile_spec
         )
-        model_int8 = model
+        model_quant = model

     edge = to_edge_transform_and_lower(
         exported_program,
@@ -765,7 +784,7 @@ def to_edge_no_delegate(
         ),
     )

-    return model_int8, edge
+    return model_quant, edge


 def transform_for_cortex_m_backend(edge_program_manager, args):
@@ -818,13 +837,13 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
     )

     # Quantize if required
-    model_int8 = None
+    model_quant = None
     if args.delegate:
-        model_int8, edge = to_edge_TOSA_delegate(
+        model_quant, edge = to_edge_TOSA_delegate(
             exported_program, args, model, example_inputs
         )
     else:
-        model_int8, edge = to_edge_no_delegate(
+        model_quant, edge = to_edge_no_delegate(
             exported_program, args, model, example_inputs
         )

@@ -884,7 +903,7 @@ def transform_for_cortex_m_backend(edge_program_manager, args):

     if args.bundleio:
         # Realize the quantization impact on numerics when generating reference output
-        reference_model = original_model if not model_int8 else model_int8
+        reference_model = original_model if not model_quant else model_quant
         save_bpte_program(exec_prog, reference_model, output_file_name)
         print(f"Bundle PTE file saved as {output_file_name}")
     else:
@@ -895,8 +914,9 @@ def transform_for_cortex_m_backend(edge_program_manager, args):
         evaluate_model(
             args.model_name,
             args.intermediates,
+            args.target,
             model_fp32,
-            model_int8,
+            model_quant,
             example_inputs,
             args.evaluate,
             args.evaluate_config,
