import argparse
import json
import os

import numpy as np
import onnx

import onnxruntime
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize
from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod


class OnnxModelCalibrationDataReader(CalibrationDataReader):
    """Feeds the model's test_data_set_* protobuf inputs to the calibrator."""

    def __init__(self, model_path):
        self.model_dir = os.path.dirname(model_path)
        data_dirs = [
            os.path.join(self.model_dir, a) for a in os.listdir(self.model_dir) if a.startswith("test_data_set_")
        ]
        model_inputs = onnxruntime.InferenceSession(model_path).get_inputs()
        name2tensors = []
        for data_dir in data_dirs:
            name2tensor = {}
            data_paths = [os.path.join(data_dir, a) for a in sorted(os.listdir(data_dir))]
            data_ndarrays = [self.read_onnx_pb_data(data_path) for data_path in data_paths]
            # Sorted file order pairs the input_*.pb files with the session inputs;
            # strict=False drops any extra files (e.g. output_*.pb, which sort after the inputs).
            for model_input, data_ndarray in zip(model_inputs, data_ndarrays, strict=False):
                name2tensor[model_input.name] = data_ndarray
            name2tensors.append(name2tensor)
        assert len(name2tensors) == len(data_dirs)
        assert len(name2tensors[0]) == len(model_inputs)

        self.calibration_data = iter(name2tensors)
    def get_next(self) -> dict:
        """Generate the input data dict for an ONNX Runtime InferenceSession run.

        Returns None once the calibration data is exhausted, which tells the calibrator to stop.
        """
        return next(self.calibration_data, None)

    def read_onnx_pb_data(self, file_pb):
        tensor = onnx.TensorProto()
        with open(file_pb, "rb") as f:
            tensor.ParseFromString(f.read())
        return onnx.numpy_helper.to_array(tensor)


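# The data reader above assumes the ONNX model-zoo test-data layout next to the
# model, sketched here with illustrative file names:
#
#   <model_dir>/
#       model.onnx
#       test_data_set_0/
#           input_0.pb      # one TensorProto per model input, in sorted order
#           input_1.pb
#           output_0.pb     # expected outputs, if present, are ignored
#       test_data_set_1/
#           ...
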
def parse_arguments():
    parser = argparse.ArgumentParser(description="Arguments for static quantization")
    parser.add_argument("-i", "--input_model_path", required=True, help="Path to the input ONNX model")
    parser.add_argument(
        "-o", "--output_quantized_model_path", required=True, help="Path to the output quantized ONNX model"
    )
    parser.add_argument(
        "--activation_type",
        choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
        default="quint8",
        help="Activation quantization type used",
    )
    parser.add_argument(
        "--weight_type",
        choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
        default="qint8",
        help="Weight quantization type used",
    )
    parser.add_argument("--enable_subgraph", action="store_true", help="If set, subgraphs will be quantized.")
    parser.add_argument(
        "--force_quantize_no_input_check",
        action="store_true",
        help="By default, some latent operators such as MaxPool and Transpose are not quantized if their input"
        " is not already quantized. If set, such operators are forced to quantize their input and thus produce"
        " quantized output. This behavior can still be disabled per node via --nodes_to_exclude.",
    )
    parser.add_argument(
        "--matmul_const_b_only",
        action="store_true",
        help="If set, only MatMul ops with a constant B input will be quantized.",
    )
    parser.add_argument(
        "--add_qdq_pair_to_weight",
        action="store_true",
        help="If set, weights remain in floating point and a full QuantizeLinear/DeQuantizeLinear pair"
        " is inserted for each weight.",
    )
    parser.add_argument(
        "--dedicated_qdq_pair",
        action="store_true",
        help="If set, a dedicated, identical QDQ pair is created for each node.",
    )
    parser.add_argument(
        "--op_types_to_exclude_output_quantization",
        nargs="+",
        default=[],
        help="If any op types are specified, the outputs of ops with those op types will not be quantized.",
    )
    parser.add_argument(
        "--calibration_method",
        default="minmax",
        choices=["minmax", "entropy", "percentile", "distribution"],
        help="Calibration method used",
    )
    parser.add_argument("--quant_format", default="qdq", choices=["qdq", "qoperator"], help="Quantization format used")
    parser.add_argument(
        "--calib_tensor_range_symmetric",
        action="store_true",
        help="If set, the final tensor range computed during calibration will be made symmetric"
        " around the central point 0.",
    )
    # TODO: --calib_strided_minmax
    # TODO: --calib_moving_average_constant
    # TODO: --calib_max_intermediate_outputs
    parser.add_argument(
        "--calib_moving_average",
        action="store_true",
        help="If set, the moving averages of the minimum and maximum values will be computed"
        " when the selected calibration method is MinMax.",
    )
    parser.add_argument(
        "--disable_quantize_bias",
        action="store_true",
        help="If set, biases remain in floating point and no quantization nodes are inserted for them."
        " By default, floating-point biases are quantized by solely inserting a DeQuantizeLinear node.",
    )

    # TODO: Add arguments related to Smooth Quant

    parser.add_argument(
        "--use_qdq_contrib_ops",
        action="store_true",
        help="If set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain,"
        " which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.",
    )
    parser.add_argument(
        "--minimum_real_range",
        type=float,
        default=0.0001,
        help="If set to a floating-point value, the calculation of the quantization parameters"
        " (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)"
        " is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is"
        " necessary for EPs like QNN that require a minimum floating-point range when determining"
        " quantization parameters.",
    )
    parser.add_argument(
        "--qdq_keep_removable_activations",
        action="store_true",
        help="If set, removable activations (e.g., Clip or Relu) will not be removed,"
        " and will be explicitly represented in the QDQ model.",
    )
    parser.add_argument(
        "--qdq_disable_weight_adjust_for_int32_bias",
        action="store_true",
        help="If set, the QDQ quantizer will not adjust the weight's scale when the bias"
        " has a scale (input_scale * weight_scale) that is too small.",
    )
    parser.add_argument("--per_channel", action="store_true", help="Whether to use per-channel quantization")
    parser.add_argument(
        "--nodes_to_quantize",
        nargs="+",
        default=None,
        help="List of node names to quantize. When this list is given, only the nodes in it are quantized.",
    )
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        default=None,
        help="List of node names to exclude. When this list is given, the nodes in it are excluded from quantization.",
    )
    parser.add_argument(
        "--op_per_channel_axis",
        nargs=2,
        action="append",
        metavar=("OP_TYPE", "PER_CHANNEL_AXIS"),
        default=[],
        help="Set the channel axis for a specific op type, for example: --op_per_channel_axis MatMul 1. It is"
        " effective only when per-channel quantization is supported for that op type and --per_channel is set."
        " If an op type supports per-channel quantization but no channel axis is explicitly specified,"
        " the default channel axis will be used.",
    )
    parser.add_argument("--tensor_quant_overrides", help="Path to the JSON file with tensor quantization overrides.")
    return parser.parse_args()


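# Example invocation of this script (the script and model file names are illustrative):
#
#   python static_quantize.py -i model.onnx -o model.quant.onnx \
#       --calibration_method minmax --activation_type quint8 --weight_type qint8
#
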
def get_tensor_quant_overrides(file):
    # TODO: Enhance the function to handle more real cases of the JSON file
    if not file:
        return {}
    with open(file) as f:
        quant_override_dict = json.load(f)
    for tensor in quant_override_dict:
        for enc_dict in quant_override_dict[tensor]:
            # The quantizer expects numpy values rather than plain JSON numbers.
            enc_dict["scale"] = np.array(enc_dict["scale"], dtype=np.float32)
            enc_dict["zero_point"] = np.array(enc_dict["zero_point"])
    return quant_override_dict


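# A minimal sketch of the overrides JSON this function handles: a map from tensor
# name to a list of encoding dicts (tensor name and values are illustrative):
#
#   {
#       "conv1_output": [
#           {"scale": 0.0235, "zero_point": 128}
#       ]
#   }
#
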
def main():
    args = parse_arguments()
    data_reader = OnnxModelCalibrationDataReader(model_path=args.input_model_path)
    arg2quant_type = {
        "qint8": QuantType.QInt8,
        "quint8": QuantType.QUInt8,
        "qint16": QuantType.QInt16,
        "quint16": QuantType.QUInt16,
        "qint4": QuantType.QInt4,
        "quint4": QuantType.QUInt4,
        "qfloat8e4m3fn": QuantType.QFLOAT8E4M3FN,
    }
    activation_type = arg2quant_type[args.activation_type]
    weight_type = arg2quant_type[args.weight_type]
    # argparse yields the axis as a string; convert it to int for the quantizer.
    qdq_op_type_per_channel_support_to_axis = {op_type: int(axis) for op_type, axis in args.op_per_channel_axis}
    extra_options = {
        "EnableSubgraph": args.enable_subgraph,
        "ForceQuantizeNoInputCheck": args.force_quantize_no_input_check,
        "MatMulConstBOnly": args.matmul_const_b_only,
        "AddQDQPairToWeight": args.add_qdq_pair_to_weight,
        "OpTypesToExcludeOutputQuantization": args.op_types_to_exclude_output_quantization,
        "DedicatedQDQPair": args.dedicated_qdq_pair,
        "QDQOpTypePerChannelSupportToAxis": qdq_op_type_per_channel_support_to_axis,
        "CalibTensorRangeSymmetric": args.calib_tensor_range_symmetric,
        "CalibMovingAverage": args.calib_moving_average,
        "QuantizeBias": not args.disable_quantize_bias,
        "UseQDQContribOps": args.use_qdq_contrib_ops,
        "MinimumRealRange": args.minimum_real_range,
        "QDQKeepRemovableActivations": args.qdq_keep_removable_activations,
        "QDQDisableWeightAdjustForInt32Bias": args.qdq_disable_weight_adjust_for_int32_bias,
        # Load the JSON file for encoding overrides.
        "TensorQuantOverrides": get_tensor_quant_overrides(args.tensor_quant_overrides),
    }
    arg2calib_method = {
        "minmax": CalibrationMethod.MinMax,
        "entropy": CalibrationMethod.Entropy,
        "percentile": CalibrationMethod.Percentile,
        "distribution": CalibrationMethod.Distribution,
    }
    arg2quant_format = {
        "qdq": QuantFormat.QDQ,
        "qoperator": QuantFormat.QOperator,
    }
    sqc = StaticQuantConfig(
        calibration_data_reader=data_reader,
        calibrate_method=arg2calib_method[args.calibration_method],
        quant_format=arg2quant_format[args.quant_format],
        activation_type=activation_type,
        weight_type=weight_type,
        op_types_to_quantize=None,
        nodes_to_quantize=args.nodes_to_quantize,
        nodes_to_exclude=args.nodes_to_exclude,
        per_channel=args.per_channel,
        reduce_range=False,
        use_external_data_format=False,
        calibration_providers=None,  # Use CPUExecutionProvider
        extra_options=extra_options,
    )
    quantize(model_input=args.input_model_path, model_output=args.output_quantized_model_path, quant_config=sqc)


if __name__ == "__main__":
    main()