
Commit 9aa5a24

Add static quantization runner
- Add a general command-line tool for static quantization
- Support loading TensorQuantOverride from json file
- Add the corresponding README
- Add corresponding unittest
1 parent afaf4a5 commit 9aa5a24

File tree: 3 files changed, +907 −1 lines changed

Lines changed: 83 additions & 1 deletion
@@ -1,2 +1,84 @@

# Quantization Tool
This tool can be used to quantize selected ONNX models. Support is based on operators in the model. Please refer to https://onnxruntime.ai/docs/performance/quantization.html for usage details and https://github.com/microsoft/onnxruntime-inference-examples/tree/main/quantization for examples.

## Static Quantization Tool

### Build
Please add `--enable_pybind` and `--build_wheel` to the build command to acquire the Python tools.

```bash
cd onnxruntime
.\build.bat --config RelWithDebInfo --build_shared_lib --parallel --cmake_generator "Visual Studio 17 2022" --enable_pybind --build_wheel
```

### Model and Data
The static quantization tool expects the model and its calibration data to be organized in the following directory structure:

```ps1
work_dir\resnet18-v1-7
├───model.onnx
├───test_data_set_0
├───test_data_set_1
├───test_data_set_2
├───test_data_set_3
├───test_data_set_4
├───test_data_set_5
├───test_data_set_6
├───test_data_set_7
├───test_data_set_8
└───test_data_set_9
```
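
If you only have in-memory NumPy inputs, you can generate this layout yourself. A minimal sketch, assuming a single-input model and a hypothetical `calibration_batches` list of arrays (the runner pairs the sorted `.pb` files in each folder with the model inputs in order):

```python
import os

import numpy as np
import onnx

# Hypothetical calibration inputs; replace with real data matching the model's input shape.
calibration_batches = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(10)]

for i, batch in enumerate(calibration_batches):
    data_dir = os.path.join("resnet18-v1-7", f"test_data_set_{i}")
    os.makedirs(data_dir, exist_ok=True)
    # Each input is stored as a serialized onnx.TensorProto.
    tensor = onnx.numpy_helper.from_array(batch)
    with open(os.path.join(data_dir, "input_0.pb"), "wb") as f:
        f.write(tensor.SerializeToString())
```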

### Usage
Install the Python wheel built from onnxruntime:
```ps1
cd work_dir
python -m venv ort_env
ort_env\Scripts\activate
python -m pip install <path-to-built-folder>\RelWithDebInfo\RelWithDebInfo\dist\<name-of-the-wheel>.whl

# The following command yields model_quant.onnx under the same directory "resnet18-v1-7"
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx

work_dir\resnet18-v1-7
├───model.onnx
├───model_quant.onnx
├───test_data_set_0
│   ...
└───test_data_set_9
```
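
To sanity-check the result, you can compare the float and quantized models on one of the test data sets. A minimal sketch, assuming a single-input model, the layout above, and a calibration file named `input_0.pb` (an assumption, not mandated by the tool):

```python
import numpy as np
import onnx
import onnxruntime

# Load the first calibration input (file name assumed; a serialized onnx.TensorProto).
tensor = onnx.TensorProto()
with open(r"resnet18-v1-7\test_data_set_0\input_0.pb", "rb") as f:
    tensor.ParseFromString(f.read())
x = onnx.numpy_helper.to_array(tensor)

float_sess = onnxruntime.InferenceSession(r"resnet18-v1-7\model.onnx")
quant_sess = onnxruntime.InferenceSession(r"resnet18-v1-7\model_quant.onnx")
input_name = float_sess.get_inputs()[0].name

y_float = float_sess.run(None, {input_name: x})[0]
y_quant = quant_sess.run(None, {input_name: x})[0]
# Quantization is lossy; expect small deviations rather than exact equality.
print("max abs diff:", np.abs(y_float - y_quant).max())
```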

### Quantization Arguments
Please refer to `static_quantize_runner.py` for the full list of arguments.

```ps1
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx --activation_type qint8 --weight_type qint16
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx --activation_type qint16 --weight_type qint16 --disable_quantize_bias
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx --activation_type qint16 --weight_type qint8 --per_channel
```
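
When `--per_channel` is set, the channel axis for a specific op type can be overridden with `--op_per_channel_axis` (this example mirrors the one given in the argument's help text):

```ps1
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx --per_channel --op_per_channel_axis MatMul 1
```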

### Tensor Quant Overrides JSON Format
With `--tensor_quant_overrides`, the tool can consume a JSON file with quantization override information:
```ps1
python -m onnxruntime.quantization.static_quantize_runner -i resnet18-v1-7\model.onnx -o resnet18-v1-7\model_quant.onnx --tensor_quant_overrides <path-to-json>\encoding.json
```

The tool expects encoding.json in the following format:
```json
{
    "conv1_1": [
        {
            "scale": 0.005,
            "zero_point": 12
        }
    ]
}
```
- Each key is the name of a tensor in the onnx model.
  - e.g. "conv1_1"
- For each tensor, a list of dictionaries should be provided.
  - For per-tensor quantization, the list contains a single dictionary.
  - For per-channel quantization, the list contains a dictionary for each channel in the tensor (see the sketch after this list).
- Each dictionary contains the information required for quantization, including:
  - scale (float)
  - zero_point (int)
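
For illustration, a hypothetical per-channel override for a tensor with three channels could look like the following (one entry per channel, in channel order; depending on the onnxruntime version, an explicit "axis" key may also be required for per-channel overrides):

```json
{
    "conv1_1": [
        {"scale": 0.005, "zero_point": 12},
        {"scale": 0.004, "zero_point": 10},
        {"scale": 0.006, "zero_point": 15}
    ]
}
```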
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@

```python
import argparse
import json
import os

import numpy as np
import onnx

import onnxruntime
from onnxruntime.quantization import QuantFormat, QuantType, StaticQuantConfig, quantize
from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod


class OnnxModelCalibrationDataReader(CalibrationDataReader):
    def __init__(self, model_path):
        self.model_dir = os.path.dirname(model_path)
        data_dirs = [
            os.path.join(self.model_dir, a) for a in os.listdir(self.model_dir) if a.startswith("test_data_set_")
        ]
        model_inputs = onnxruntime.InferenceSession(model_path).get_inputs()
        name2tensors = []
        for data_dir in data_dirs:
            name2tensor = {}
            data_paths = [os.path.join(data_dir, a) for a in sorted(os.listdir(data_dir))]
            data_ndarrays = [self.read_onnx_pb_data(data_path) for data_path in data_paths]
            for model_input, data_ndarray in zip(model_inputs, data_ndarrays, strict=False):
                name2tensor[model_input.name] = data_ndarray
            name2tensors.append(name2tensor)
        assert len(name2tensors) == len(data_dirs)
        assert len(name2tensors[0]) == len(model_inputs)

        self.calibration_data = iter(name2tensors)

    def get_next(self) -> dict:
        """Generate the input-data dict for an ONNX InferenceSession run."""
        return next(self.calibration_data, None)

    def read_onnx_pb_data(self, file_pb):
        tensor = onnx.TensorProto()
        with open(file_pb, "rb") as f:
            tensor.ParseFromString(f.read())
        ret = onnx.numpy_helper.to_array(tensor)
        return ret


def parse_arguments():
    parser = argparse.ArgumentParser(description="The arguments for static quantization")
    parser.add_argument("-i", "--input_model_path", required=True, help="Path to the input onnx model")
    parser.add_argument(
        "-o", "--output_quantized_model_path", required=True, help="Path to the output quantized onnx model"
    )
    parser.add_argument(
        "--activation_type",
        choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
        default="quint8",
        help="Activation quantization type used",
    )
    parser.add_argument(
        "--weight_type",
        choices=["qint8", "quint8", "qint16", "quint16", "qint4", "quint4", "qfloat8e4m3fn"],
        default="qint8",
        help="Weight quantization type used",
    )
    parser.add_argument("--enable_subgraph", action="store_true", help="If set, subgraphs will be quantized.")
    parser.add_argument(
        "--force_quantize_no_input_check",
        action="store_true",
        help="By default, some latent operators such as MaxPool and Transpose are not quantized unless their input"
        " is already quantized. Set this flag to force such operators to always quantize their input and thus"
        " generate quantized output. This behavior can still be disabled per node using nodes_to_exclude.",
    )
    parser.add_argument(
        "--matmul_const_b_only",
        action="store_true",
        help="If set, only MatMul with const B will be quantized.",
    )
    parser.add_argument(
        "--add_qdq_pair_to_weight",
        action="store_true",
        help="If set, weights remain floating-point and both QuantizeLinear/DeQuantizeLinear nodes are inserted"
        " for them.",
    )
    parser.add_argument(
        "--dedicated_qdq_pair",
        action="store_true",
        help="If set, an identical and dedicated QDQ pair is created for each node.",
    )
    parser.add_argument(
        "--op_types_to_exclude_output_quantization",
        nargs="+",
        default=[],
        help="If any op types are specified, the output of ops with those op types will not be quantized.",
    )
    parser.add_argument(
        "--calibration_method",
        default="minmax",
        choices=["minmax", "entropy", "percentile", "distribution"],
        help="Calibration method used",
    )
    parser.add_argument("--quant_format", default="qdq", choices=["qdq", "qoperator"], help="Quantization format used")
    parser.add_argument(
        "--calib_tensor_range_symmetric",
        action="store_true",
        help="If enabled, the final tensor range during calibration is explicitly made symmetric around the"
        " central point 0.",
    )
    # TODO: --calib_strided_minmax
    # TODO: --calib_moving_average_constant
    # TODO: --calib_max_intermediate_outputs
    parser.add_argument(
        "--calib_moving_average",
        action="store_true",
        help="If enabled, the moving average of the minimum and maximum values is computed when the calibration"
        " method selected is MinMax.",
    )
    parser.add_argument(
        "--disable_quantize_bias",
        action="store_true",
        help="If set, biases remain floating-point and no quantization nodes are associated with them."
        " By default, floating-point biases are quantized by solely inserting a DeQuantizeLinear node.",
    )

    # TODO: Add arguments related to Smooth Quant

    parser.add_argument(
        "--use_qdq_contrib_ops",
        action="store_true",
        help="If set, the inserted QuantizeLinear and DequantizeLinear ops will have the com.microsoft domain,"
        " which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear contrib op implementations.",
    )
    parser.add_argument(
        "--minimum_real_range",
        type=float,
        default=0.0001,
        help="If set to a floating-point value, the calculation of the quantization parameters"
        " (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)"
        " is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is"
        " necessary for EPs like QNN that require a minimum floating-point range when determining"
        " quantization parameters.",
    )
    parser.add_argument(
        "--qdq_keep_removable_activations",
        action="store_true",
        help="If set, removable activations (e.g., Clip or Relu) will not be removed,"
        " and will be explicitly represented in the QDQ model.",
    )
    parser.add_argument(
        "--qdq_disable_weight_adjust_for_int32_bias",
        action="store_true",
        help="If set, the QDQ quantizer will not adjust the weight's scale when the bias"
        " has a scale (input_scale * weight_scale) that is too small.",
    )
    parser.add_argument("--per_channel", action="store_true", help="If set, per-channel quantization is used.")
    parser.add_argument(
        "--nodes_to_quantize",
        nargs="+",
        default=None,
        help="List of node names to quantize. When this list is provided, only the nodes in it are quantized.",
    )
    parser.add_argument(
        "--nodes_to_exclude",
        nargs="+",
        default=None,
        help="List of node names to exclude. When this list is provided, the nodes in it are excluded from"
        " quantization.",
    )
    parser.add_argument(
        "--op_per_channel_axis",
        nargs=2,
        action="append",
        metavar=("OP_TYPE", "PER_CHANNEL_AXIS"),
        default=[],
        help="Set the channel axis for a specific op type, for example: --op_per_channel_axis MatMul 1. This is"
        " effective only when per-channel quantization is supported for the op type and per_channel is True. If a"
        " specific op type supports per-channel quantization but is not explicitly given a channel axis, the"
        " default channel axis will be used.",
    )
    parser.add_argument("--tensor_quant_overrides", help="Set the json file for tensor quantization overrides.")
    return parser.parse_args()


def get_tensor_quant_overrides(file):
    # TODO: Enhance the function to handle more real cases of json file
    if not file:
        return {}
    with open(file) as f:
        quant_override_dict = json.load(f)
    # Convert scalar scale/zero_point values into numpy arrays, as expected by the quantizer.
    for tensor in quant_override_dict:
        for enc_dict in quant_override_dict[tensor]:
            enc_dict["scale"] = np.array(enc_dict["scale"], dtype=np.float32)
            enc_dict["zero_point"] = np.array(enc_dict["zero_point"])
    return quant_override_dict


def main():
    args = parse_arguments()
    data_reader = OnnxModelCalibrationDataReader(model_path=args.input_model_path)
    arg2quant_type = {
        "qint8": QuantType.QInt8,
        "quint8": QuantType.QUInt8,
        "qint16": QuantType.QInt16,
        "quint16": QuantType.QUInt16,
        "qint4": QuantType.QInt4,
        "quint4": QuantType.QUInt4,
        "qfloat8e4m3fn": QuantType.QFLOAT8E4M3FN,
    }
    activation_type = arg2quant_type[args.activation_type]
    weight_type = arg2quant_type[args.weight_type]
    # Cast the parsed axis strings to int so they can be used as tensor axes downstream.
    qdq_op_type_per_channel_support_to_axis = {op_type: int(axis) for op_type, axis in args.op_per_channel_axis}
    extra_options = {
        "EnableSubgraph": args.enable_subgraph,
        "ForceQuantizeNoInputCheck": args.force_quantize_no_input_check,
        "MatMulConstBOnly": args.matmul_const_b_only,
        "AddQDQPairToWeight": args.add_qdq_pair_to_weight,
        "OpTypesToExcludeOutputQuantization": args.op_types_to_exclude_output_quantization,
        "DedicatedQDQPair": args.dedicated_qdq_pair,
        "QDQOpTypePerChannelSupportToAxis": qdq_op_type_per_channel_support_to_axis,
        "CalibTensorRangeSymmetric": args.calib_tensor_range_symmetric,
        "CalibMovingAverage": args.calib_moving_average,
        "QuantizeBias": not args.disable_quantize_bias,
        "UseQDQContribOps": args.use_qdq_contrib_ops,
        "MinimumRealRange": args.minimum_real_range,
        "QDQKeepRemovableActivations": args.qdq_keep_removable_activations,
        "QDQDisableWeightAdjustForInt32Bias": args.qdq_disable_weight_adjust_for_int32_bias,
        # Load json file for encoding override
        "TensorQuantOverrides": get_tensor_quant_overrides(args.tensor_quant_overrides),
    }
    arg2calib_method = {
        "minmax": CalibrationMethod.MinMax,
        "entropy": CalibrationMethod.Entropy,
        "percentile": CalibrationMethod.Percentile,
        "distribution": CalibrationMethod.Distribution,
    }
    arg2quant_format = {
        "qdq": QuantFormat.QDQ,
        "qoperator": QuantFormat.QOperator,
    }
    sqc = StaticQuantConfig(
        calibration_data_reader=data_reader,
        calibrate_method=arg2calib_method[args.calibration_method],
        quant_format=arg2quant_format[args.quant_format],
        activation_type=activation_type,
        weight_type=weight_type,
        op_types_to_quantize=None,
        nodes_to_quantize=args.nodes_to_quantize,
        nodes_to_exclude=args.nodes_to_exclude,
        per_channel=args.per_channel,
        reduce_range=False,
        use_external_data_format=False,
        calibration_providers=None,  # Use CPUExecutionProvider
        extra_options=extra_options,
    )
    quantize(model_input=args.input_model_path, model_output=args.output_quantized_model_path, quant_config=sqc)


if __name__ == "__main__":
    main()
```
