Commit 671b84f
Add support for ppc64le quantization (#59)
This PR adds support for ppc64le (PowerPC) architecture quantization to the optimum-onnx tooling. It extends both the CLI interface and the underlying QuantizationConfig to accommodate ppc64le as a target.

Signed-off-by: mgiessing <[email protected]>
Co-authored-by: Ilyas Moutawwakil <[email protected]>
1 parent (1aea1fe) · commit 671b84f

File tree

2 files changed: +61 −0 lines changed


optimum/commands/onnxruntime/quantize.py

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,7 @@ def parse_args_onnxruntime_quantize(parser: ArgumentParser):
     level_group.add_argument(
         "--avx512_vnni", action="store_true", help="Quantization with AVX-512 and VNNI instructions."
     )
+    level_group.add_argument("--ppc64le", action="store_true", help="Quantization for the PowerPC architecture.")
     level_group.add_argument("--tensorrt", action="store_true", help="Quantization for NVIDIA TensorRT optimizer.")
     level_group.add_argument(
         "-c",
@@ -93,6 +94,8 @@ def run(self):
             qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=self.args.per_channel)
         elif self.args.avx512_vnni:
             qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.args.per_channel)
+        elif self.args.ppc64le:
+            qconfig = AutoQuantizationConfig.ppc64le(is_static=False, per_channel=self.args.per_channel)
         elif self.args.tensorrt:
             raise ValueError(
                 "TensorRT quantization relies on static quantization that requires calibration, which is currently not supported through optimum-cli. Please adapt Optimum static quantization examples to run static quantization for TensorRT: https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization"

optimum/onnxruntime/configuration.py

Lines changed: 58 additions & 0 deletions
@@ -602,6 +602,64 @@ def avx512_vnni(
             operators_to_quantize=operators_to_quantize,
         )
 
+    @staticmethod
+    def ppc64le(
+        is_static: bool,
+        use_symmetric_activations: bool = False,
+        use_symmetric_weights: bool = True,
+        per_channel: bool = True,
+        nodes_to_quantize: list[str] | None = None,
+        nodes_to_exclude: list[str] | None = None,
+        operators_to_quantize: list[str] | None = None,
+    ):
+        """Creates a [`~onnxruntime.QuantizationConfig`] fit for ppc64le.
+
+        When targeting IBM POWER10 ppc64le, the underlying execution engine leverages 8-bit outer-product instructions
+        (e.g., xvi8ger4pp and its signed/unsigned variants) to compute fused byte dot-products and accumulate into 32-bit results, i.e.,
+        i32 += i8(w) * u8(x) at 4-way granularity per output element within a single instruction, using a 512-bit MMA accumulator.
+
+        MMA (Matrix-Multiply Assist) is a POWER10 extension of the Power ISA, defined in the Power ISA v3.1 specification and
+        exposed via VSX-backed 512-bit accumulators and compiler intrinsics.
+
+        POWER10 MMA 8-bit outer-product instructions are designed to accelerate INT8 inference on ppc64le by fusing
+        multiply-accumulate data paths and minimizing instruction count.
+
+        Args:
+            is_static (`bool`):
+                Boolean flag to indicate whether we target static or dynamic quantization.
+            use_symmetric_activations (`bool`, defaults to `False`):
+                Whether to use symmetric quantization for activations.
+            use_symmetric_weights (`bool`, defaults to `True`):
+                Whether to use symmetric quantization for weights.
+            per_channel (`bool`, defaults to `True`):
+                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
+                increase overall accuracy while making the quantized model heavier.
+            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
+                Specific nodes to quantize. If `None`, all nodes whose operator type is in `operators_to_quantize` will be quantized.
+            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
+                Specific nodes to exclude from quantization. The list of nodes in a model can be found by loading the ONNX model through onnx.load, or through visual inspection with [netron](https://github.com/lutzroeder/netron).
+            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
+                Type of nodes to perform quantization on. By default, all the quantizable operators will be quantized. Quantizable operators can be found at https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.
+        """
+        format, mode, operators_to_quantize = default_quantization_parameters(
+            is_static, operators_to_quantize=operators_to_quantize
+        )
+
+        return QuantizationConfig(
+            is_static=is_static,
+            format=format,
+            mode=mode,
+            activations_dtype=QuantType.QUInt8,
+            activations_symmetric=use_symmetric_activations,
+            weights_dtype=QuantType.QInt8,
+            weights_symmetric=use_symmetric_weights,
+            per_channel=per_channel,
+            reduce_range=False,
+            nodes_to_quantize=nodes_to_quantize or [],
+            nodes_to_exclude=nodes_to_exclude or [],
+            operators_to_quantize=operators_to_quantize,
+        )
+
     @staticmethod
     def tensorrt(
         per_channel: bool = True,
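
To make the docstring's `i32 += i8(w) * u8(x)` description concrete: each accumulator element of a 4-way byte outer-product instruction fuses a dot product of four signed weight bytes with four unsigned activation bytes into a 32-bit running sum. A minimal NumPy emulation, for illustration only (plain Python, not the POWER10 intrinsics):

    import numpy as np

    w = np.array([-3, 7, 1, -2], dtype=np.int8)     # quantized weights (QInt8)
    x = np.array([10, 200, 5, 30], dtype=np.uint8)  # quantized activations (QUInt8)
    acc = np.int32(0)
    # Widen to 32 bits before multiplying, as the hardware accumulator does:
    acc += np.dot(w.astype(np.int32), x.astype(np.int32))  # -30 + 1400 + 5 - 60 = 1315

This u8-activation / s8-weight pairing is why the config above sets activations_dtype=QuantType.QUInt8 and weights_dtype=QuantType.QInt8.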
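For programmatic use, a minimal sketch of driving the new config through optimum's ORTQuantizer (assuming the usual optimum.onnxruntime dynamic-quantization workflow; model paths are placeholders):

    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    # Dynamic (is_static=False), per-channel INT8 config targeting POWER10 MMA
    qconfig = AutoQuantizationConfig.ppc64le(is_static=False, per_channel=True)

    quantizer = ORTQuantizer.from_pretrained("path/to/exported_onnx_model")
    quantizer.quantize(save_dir="path/to/quantized_model", quantization_config=qconfig)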
