Commit 671b84f
Add support for ppc64le quantization (#59)
This PR adds support for ppc64le (PowerPC) architecture quantization to the optimum-onnx tooling. It extends both the CLI interface and the underlying QuantizationConfig to accommodate ppc64le as a target.

Signed-off-by: mgiessing <[email protected]>
Co-authored-by: Ilyas Moutawwakil <[email protected]>
1 parent (1aea1fe) · commit 671b84f

File tree

2 files changed: +61 −0 lines changed


optimum/commands/onnxruntime/quantize.py

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,7 @@ def parse_args_onnxruntime_quantize(parser: ArgumentParser):
     level_group.add_argument(
         "--avx512_vnni", action="store_true", help="Quantization with AVX-512 and VNNI instructions."
     )
+    level_group.add_argument("--ppc64le", action="store_true", help="Quantization for the PowerPC architecture.")
     level_group.add_argument("--tensorrt", action="store_true", help="Quantization for NVIDIA TensorRT optimizer.")
     level_group.add_argument(
         "-c",
@@ -93,6 +94,8 @@ def run(self):
             qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=self.args.per_channel)
         elif self.args.avx512_vnni:
             qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.args.per_channel)
+        elif self.args.ppc64le:
+            qconfig = AutoQuantizationConfig.ppc64le(is_static=False, per_channel=self.args.per_channel)
         elif self.args.tensorrt:
             raise ValueError(
                 "TensorRT quantization relies on static quantization that requires calibration, which is currently not supported through optimum-cli. Please adapt Optimum static quantization examples to run static quantization for TensorRT: https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization"

optimum/onnxruntime/configuration.py

Lines changed: 58 additions & 0 deletions
@@ -602,6 +602,64 @@ def avx512_vnni(
             operators_to_quantize=operators_to_quantize,
         )
 
+    @staticmethod
+    def ppc64le(
+        is_static: bool,
+        use_symmetric_activations: bool = False,
+        use_symmetric_weights: bool = True,
+        per_channel: bool = True,
+        nodes_to_quantize: list[str] | None = None,
+        nodes_to_exclude: list[str] | None = None,
+        operators_to_quantize: list[str] | None = None,
+    ):
+        """Creates a [`~onnxruntime.QuantizationConfig`] fit for ppc64le.
+
+        When targeting IBM POWER10 ppc64le, the underlying execution engine leverages 8-bit outer-product instructions
+        (e.g., xvi8ger4pp and its signed/unsigned variants) to compute fused byte dot-products and accumulate into 32-bit results, i.e.,
+        i32 += i8(w) * u8(x) at 4-way granularity per output element within a single instruction, using a 512-bit MMA accumulator.
+
+        MMA (Matrix-Multiply Assist) is a POWER10 extension of the Power ISA, defined in the Power ISA v3.1 specification and
+        exposed via VSX-backed 512-bit accumulators and compiler intrinsics.
+
+        POWER10 MMA 8-bit outer-product instructions are designed to accelerate INT8 inference on ppc64le by fusing
+        multiply-accumulate data paths and minimizing instruction count.
+
+        Args:
+            is_static (`bool`):
+                Boolean flag to indicate whether we target static or dynamic quantization.
+            use_symmetric_activations (`bool`, defaults to `False`):
+                Whether to use symmetric quantization for activations.
+            use_symmetric_weights (`bool`, defaults to `True`):
+                Whether to use symmetric quantization for weights.
+            per_channel (`bool`, defaults to `True`):
+                Whether we should quantize per-channel (also known as "per-row"). Enabling this can
+                increase overall accuracy while making the quantized model heavier.
+            nodes_to_quantize (`Optional[List[str]]`, defaults to `None`):
+                Specific nodes to quantize. If `None`, all nodes whose operator type is in `operators_to_quantize` will be quantized.
+            nodes_to_exclude (`Optional[List[str]]`, defaults to `None`):
+                Specific nodes to exclude from quantization. The list of nodes in a model can be found by loading the ONNX model through onnx.load, or through visual inspection with [netron](https://github.com/lutzroeder/netron).
+            operators_to_quantize (`Optional[List[str]]`, defaults to `None`):
+                Type of nodes to perform quantization on. By default, all the quantizable operators will be quantized. Quantizable operators can be found at https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/registry.py.
+        """
+        format, mode, operators_to_quantize = default_quantization_parameters(
+            is_static, operators_to_quantize=operators_to_quantize
+        )
+
+        return QuantizationConfig(
+            is_static=is_static,
+            format=format,
+            mode=mode,
+            activations_dtype=QuantType.QUInt8,
+            activations_symmetric=use_symmetric_activations,
+            weights_dtype=QuantType.QInt8,
+            weights_symmetric=use_symmetric_weights,
+            per_channel=per_channel,
+            reduce_range=False,
+            nodes_to_quantize=nodes_to_quantize or [],
+            nodes_to_exclude=nodes_to_exclude or [],
+            operators_to_quantize=operators_to_quantize,
+        )
+
     @staticmethod
     def tensorrt(
         per_channel: bool = True,
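
To make the docstring's `i32 += i8(w) * u8(x)` description concrete: each accumulator element of a 4-way byte outer-product instruction fuses a dot product of four signed weight bytes with four unsigned activation bytes into a 32-bit running sum. A minimal NumPy emulation, for illustration only (plain Python, not the POWER10 intrinsics):

    import numpy as np

    w = np.array([-3, 7, 1, -2], dtype=np.int8)     # quantized weights (QInt8)
    x = np.array([10, 200, 5, 30], dtype=np.uint8)  # quantized activations (QUInt8)
    acc = np.int32(0)
    # Widen to 32 bits before multiplying, as the hardware accumulator does:
    acc += np.dot(w.astype(np.int32), x.astype(np.int32))  # -30 + 1400 + 5 - 60 = 1315

This u8-activation / s8-weight pairing is why the config above sets activations_dtype=QuantType.QUInt8 and weights_dtype=QuantType.QInt8.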
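For programmatic use, a minimal sketch of driving the new config through optimum's ORTQuantizer (assuming the usual optimum.onnxruntime dynamic-quantization workflow; model paths are placeholders):

    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    # Dynamic (is_static=False), per-channel INT8 config targeting POWER10 MMA
    qconfig = AutoQuantizationConfig.ppc64le(is_static=False, per_channel=True)

    quantizer = ORTQuantizer.from_pretrained("path/to/exported_onnx_model")
    quantizer.quantize(save_dir="path/to/quantized_model", quantization_config=qconfig)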
