diff --git a/tools/quantize/README.md b/tools/quantize/README.md
index 3547de37..54a931f0 100644
--- a/tools/quantize/README.md
+++ b/tools/quantize/README.md
@@ -7,7 +7,7 @@ Install dependencies before trying quantization:
 pip install -r requirements.txt
 ```
 
-## Usage
+## Quantization Usage
 
 Quantize all models in the Zoo:
 
 ```shell
@@ -52,6 +52,16 @@ models = dict(
 python quantize-inc.py model1
 ```
 
+## Blockwise Quantization Usage
+
+`block_quantize.py` requires Python >= 3.7.
+
+To perform weight-only blockwise quantization:
+
+```shell
+python block_quantize.py --input_model INPUT_MODEL.onnx --output_model OUTPUT_MODEL.onnx --block_size BLOCK_SIZE --bits {8,16}
+```
+
 ## Dataset
 
 Some models are quantized with extra datasets.
 
 - [MP-PalmDet](../../models/palm_detection_mediapipe) and [MP-HandPose](../../models/handpose_estimation_mediapipe) are quantized with the evaluation set of [FreiHAND](https://lmb.informatik.uni-freiburg.de/resources/datasets/FreihandDataset.en.html). Download the dataset from [this link](https://lmb.informatik.uni-freiburg.de/data/freihand/FreiHAND_pub_v2_eval.zip). Unpack it and replace `path/to/dataset` with the path to `FreiHAND_pub_v2_eval/evaluation/rgb`.
diff --git a/tools/quantize/block_quantize.py b/tools/quantize/block_quantize.py
new file mode 100644
index 00000000..6d49614d
--- /dev/null
+++ b/tools/quantize/block_quantize.py
@@ -0,0 +1,419 @@
import sys

MIN_PYTHON_VERSION = (3, 7)

if sys.version_info < MIN_PYTHON_VERSION:
    raise ImportError("This script requires Python 3.7 or higher!")

import argparse
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import numpy as np
import onnx
from onnx import helper, version_converter

BITS_TO_NUMPY_TYPE = {8: np.uint8, 16: np.uint16}

SUPPORTED_OPS = {"Conv"}

# Blocked (per-block) DequantizeLinear requires opset 21
ONNX_OPSET = 21


@dataclass
class BlockQuantizeConfig:
    input_model_path: str
    output_model_path: str
    block_size: int
    bits: int


@dataclass
class BlockQuantizeResult:
    quantized_weights: np.ndarray = field(default_factory=lambda: np.array([]))
    scales: np.ndarray = field(default_factory=lambda: np.array([]))
    zero_point: np.ndarray = field(default_factory=lambda: np.array([]))
    block_size: int = 1
    axis: int = 1
    original_shape: Tuple = field(default_factory=tuple)
    quantization_error: np.ndarray = field(default_factory=lambda: np.array([]))


@dataclass
class LayerParams:
    weights: np.ndarray = field(default_factory=lambda: np.array([]))
    bias: Optional[np.ndarray] = None


def closest_divisor(number: int, divisor: int) -> int:
    # Largest divisor of `number` that does not exceed `divisor`
    for d in range(divisor, 0, -1):
        if number % d == 0:
            return d
    return 1


def block_dequantize_tensor(
    x: np.ndarray, block_axis: int, scale: np.ndarray, zero_point: np.ndarray
) -> np.ndarray:
    repeats = x.shape[block_axis] // scale.shape[block_axis]

    # Broadcast the per-block scales and zero points to one value per element
    x_scale_elementwise = np.repeat(scale, repeats=repeats, axis=block_axis)
    x_zero_point_elementwise = np.repeat(zero_point, repeats=repeats, axis=block_axis)

    y = (
        x.astype(np.float32) - x_zero_point_elementwise.astype(np.float32)
    ) * x_scale_elementwise

    return y


def block_quantize_tensor(
    x: np.ndarray,
    block_axis: int,
    scale: np.ndarray,
    zero_point: np.ndarray,
    n_bits: int,
) -> np.ndarray:
    repeats = x.shape[block_axis] // scale.shape[block_axis]

    y_scale_elementwise = np.repeat(scale, repeats=repeats, axis=block_axis)
    y_zero_point_elementwise = np.repeat(zero_point, repeats=repeats, axis=block_axis)

    # Clip before casting: the rounded zero point can push values slightly
    # outside the representable range, which would wrap around under astype
    qinfo = np.iinfo(BITS_TO_NUMPY_TYPE[n_bits])
    y = np.clip(
        np.rint(x / y_scale_elementwise + y_zero_point_elementwise),
        qinfo.min,
        qinfo.max,
    ).astype(BITS_TO_NUMPY_TYPE[n_bits])

    return y
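# Illustrative example (comments only, not executed). Quantizing the row
# x = [[0.0, 1.0, 2.0, 4.0]] with block_size=2 and bits=8 along axis 1
# splits it into the blocks [0.0, 1.0] and [2.0, 4.0]:
#
#   block 1: min=0.0, max=1.0 -> scale = 1.0 / 255 ~= 0.0039, zero_point = 0
#   block 2: min=0.0, max=4.0 -> scale = 4.0 / 255 ~= 0.0157, zero_point = 0
#            (ranges are widened to include 0; see compute_scale_zeropoint)
#
# block_quantize_tensor maps each element to rint(x / scale + zero_point),
# giving [[0, 255, 128, 255]], and block_dequantize_tensor reconstructs
# approximately [[0.0, 1.0, 2.008, 4.0]].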
def create_dequantize_node(
    node_name,
    quantized_weights,
    scales,
    zero_point,
    dequantized_weights,
    block_size,
    axis,
) -> onnx.NodeProto:
    block_size_attr = helper.make_attribute("block_size", block_size)
    axis_attr = helper.make_attribute("axis", axis)

    n = helper.make_node(
        "DequantizeLinear",
        inputs=[quantized_weights, scales, zero_point],
        outputs=[dequantized_weights],
        name=node_name,
    )
    n.attribute.extend([block_size_attr, axis_attr])
    return n


def create_reshape_node(
    node_name, dequantized_weights, shape_tensor, reshaped_weights_name
) -> onnx.NodeProto:
    return helper.make_node(
        "Reshape",
        inputs=[dequantized_weights, shape_tensor],
        outputs=[reshaped_weights_name],
        name=node_name,
    )


class BlockQuantizer:
    def __init__(self, conf: BlockQuantizeConfig) -> None:
        self.conf = conf
        self.validate_conf()

        self.model = onnx.load(conf.input_model_path)

        # Convert to the opset that supports blockwise DequantizeLinear
        if self.model.opset_import[0].version != ONNX_OPSET:
            self.model = version_converter.convert_version(self.model, ONNX_OPSET)

        self.graph = self.model.graph
        self.initializers_map = {
            init.name: init for init in self.model.graph.initializer
        }

    def validate_conf(self):
        if not os.path.isfile(self.conf.input_model_path):
            raise ValueError(
                f"Input model path '{self.conf.input_model_path}' does not exist or is not a file."
            )

        if not self.conf.input_model_path.lower().endswith(".onnx"):
            raise ValueError(
                f"Input model path '{self.conf.input_model_path}' must have a .onnx extension."
            )

        if not self.conf.output_model_path.lower().endswith(".onnx"):
            raise ValueError(
                f"Output model path '{self.conf.output_model_path}' must have a .onnx extension."
            )

        if self.conf.block_size <= 0:
            raise ValueError("Block size must be a positive integer.")

        if self.conf.bits not in BITS_TO_NUMPY_TYPE:
            allowed_values = ", ".join(str(k) for k in BITS_TO_NUMPY_TYPE)
            raise ValueError(
                f"Bits must be one of the following values: [{allowed_values}]."
            )
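    # The rewrite performed by run() replaces, for each supported node, the
    # float32 weight initializer with a quantized one plus a dequantize chain:
    #
    #   W (float32 initializer) --> Conv
    #
    # becomes
    #
    #   W_q (uint8/uint16) --> DequantizeLinear --> Reshape --> Conv
    #                          (scales, zero points)  (original shape)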
    def get_initializer_tensor(self, name: str) -> Optional[np.ndarray]:
        if name in self.initializers_map:
            return onnx.numpy_helper.to_array(self.initializers_map[name])

        return None

    def get_layer_params(self, node: onnx.NodeProto) -> LayerParams:
        params = LayerParams()

        weights_name = node.input[1]
        params.weights = self.get_initializer_tensor(weights_name)

        if len(node.input) > 2:
            bias_name = node.input[2]
            params.bias = self.get_initializer_tensor(bias_name)

        return params

    def compute_scale_zeropoint(
        self, b_min: np.ndarray, b_max: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        assert (
            b_min < b_max
        ).all(), (
            "minimum must be lower than maximum when computing scale and zero point"
        )

        # Zero must be representable in the range; this enforces qmin <= zero_point <= qmax
        b_min = np.minimum(b_min, np.zeros_like(b_min, dtype=b_min.dtype))
        b_max = np.maximum(b_max, np.zeros_like(b_max, dtype=b_max.dtype))

        qmin = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).min
        qmax = np.iinfo(BITS_TO_NUMPY_TYPE[self.conf.bits]).max

        dq = qmax - qmin

        scales = (b_max - b_min) / dq
        zeropoints = np.rint(qmin - b_min / scales).astype(
            BITS_TO_NUMPY_TYPE[self.conf.bits]
        )

        return (scales, zeropoints)

    def block_quantize(self, weight: np.ndarray) -> BlockQuantizeResult:
        original_shape = weight.shape
        # Flatten to 2D: one row per output channel
        weight = weight.reshape((weight.shape[0], -1))

        quantization_axis = 1

        # Fall back to the closest divisor so every block has the same size
        block_size = closest_divisor(weight.shape[1], self.conf.block_size)

        assert (
            weight.shape[1] % block_size == 0
        ), f"weight shape ({weight.shape[1]}) must be divisible by block size ({block_size})"

        # Warning: axis = 1 specific instruction!
        blocked_weight = weight.reshape(
            (weight.shape[0], weight.shape[1] // block_size, -1)
        )

        # Warning: axis = 1 specific instruction!
        blocked_max = np.max(blocked_weight, -1)
        # Warning: axis = 1 specific instruction!
        blocked_min = np.min(blocked_weight, -1)

        scales, zeropoints = self.compute_scale_zeropoint(blocked_min, blocked_max)

        quantized_weight = block_quantize_tensor(
            weight, quantization_axis, scales, zeropoints, self.conf.bits
        )
        reconstructed_mat = block_dequantize_tensor(
            quantized_weight, quantization_axis, scales, zeropoints
        )

        # Frobenius norm of the reconstruction error
        qerror = np.linalg.norm(reconstructed_mat - weight)

        res = BlockQuantizeResult(
            quantized_weight,
            scales,
            zeropoints,
            block_size,
            quantization_axis,
            original_shape,
            qerror,
        )

        return res

    def get_model_size(self, model_path: str) -> float:
        size_bytes = os.path.getsize(model_path)
        size_kb = size_bytes / 1024

        return size_kb
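    # Note: each entry of `sqe` below is the squared Frobenius norm of a
    # layer's reconstruction error, so the reported "Mean Squared
    # Quantization Error" averages over layers, not over individual weights.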
    def display_summary(self, sqe: List):
        # Guard against division by zero when no layer was quantized
        mse = sum(sqe) / len(sqe) if sqe else 0.0
        original_model_size = self.get_model_size(self.conf.input_model_path)
        quantized_model_size = self.get_model_size(self.conf.output_model_path)

        print("Done! Results saved in", self.conf.output_model_path)
        print("\nSummary of Results:\n")
        print(f"{'Metric':<31} {'Value':<10}")
        print(f"{'-' * 42}")
        print(f"{'Mean Squared Quantization Error':<31} {mse:.6f}")
        print(f"{'Original Model Size (KB)':<31} {original_model_size:,.2f}")
        print(f"{'Block-Quantized Model Size (KB)':<31} {quantized_model_size:,.2f}")

    def run(self):
        print("Quantizing the model...")

        visited_nodes = []
        sqe = []

        for node in self.model.graph.node:
            if node.name in visited_nodes:
                continue
            if node.op_type in SUPPORTED_OPS:
                conv_params = self.get_layer_params(node)
                block_quantize_res = self.block_quantize(conv_params.weights)

                quantized_weights_name = f"{node.name}_quantized_weights"
                quantized_node_name = f"{node.name}_quantized_node"
                dequantized_weights_name = f"{node.name}_dequantized_weights"
                scales_name = f"{node.name}_scales"
                zero_point_name = f"{node.name}_zero_point"

                shape_node_name = f"{node.name}_shape_node"
                shape_name = f"{node.name}_shape"
                reshaped_weights_name = f"{node.name}_reshaped_weights"

                dequantize_node = create_dequantize_node(
                    quantized_node_name,
                    quantized_weights_name,
                    scales_name,
                    zero_point_name,
                    dequantized_weights_name,
                    block_quantize_res.block_size,
                    block_quantize_res.axis,
                )
                reshape_node = create_reshape_node(
                    shape_node_name,
                    dequantized_weights_name,
                    shape_name,
                    reshaped_weights_name,
                )

                shape_tensor = onnx.numpy_helper.from_array(
                    np.array(block_quantize_res.original_shape), name=shape_name
                )
                scale_initializer = onnx.numpy_helper.from_array(
                    block_quantize_res.scales, name=scales_name
                )
                zero_point_initializer = onnx.numpy_helper.from_array(
                    block_quantize_res.zero_point, name=zero_point_name
                )
                quantized_weights_initializer = onnx.numpy_helper.from_array(
                    block_quantize_res.quantized_weights, name=quantized_weights_name
                )

                dequantized_weights_info = helper.make_tensor_value_info(
                    dequantized_weights_name,
                    onnx.TensorProto.FLOAT,
                    block_quantize_res.quantized_weights.shape,
                )
                shape_info = helper.make_tensor_value_info(
                    reshaped_weights_name,
                    onnx.TensorProto.FLOAT,
                    block_quantize_res.original_shape,
                )

                self.graph.initializer.extend(
                    [
                        scale_initializer,
                        zero_point_initializer,
                        shape_tensor,
                        quantized_weights_initializer,
                    ]
                )

                # Remove the original fp32 weights
                self.graph.initializer.remove(
                    next(
                        init
                        for init in self.graph.initializer
                        if init.name == node.input[1]
                    )
                )
                node.input[1] = reshaped_weights_name

                # Preserve the topological order of graph nodes
                self.graph.node.insert(0, reshape_node)
                self.graph.node.insert(0, dequantize_node)
                self.graph.value_info.insert(0, shape_info)
                self.graph.value_info.insert(0, dequantized_weights_info)

                sqe.append(block_quantize_res.quantization_error**2)
                visited_nodes.append(node.name)

        onnx.checker.check_model(self.model, full_check=True)
        onnx.save(self.model, self.conf.output_model_path)

        self.display_summary(sqe)
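# Illustrative programmatic use (mirrors the CLI defined below; the paths are
# placeholders, not files shipped with this repository):
#
#   cfg = BlockQuantizeConfig(
#       input_model_path="model.onnx",
#       output_model_path="model_int8.onnx",
#       block_size=32,
#       bits=8,
#   )
#   BlockQuantizer(cfg).run()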
def setup_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Blockwise quantization tool")

    parser.add_argument(
        "-i",
        "--input_model",
        type=str,
        help="The path of the ONNX model to quantize",
        required=True,
    )
    parser.add_argument(
        "-bs",
        "--block_size",
        type=int,
        help="The maximum size of a quantization block",
        required=True,
    )
    parser.add_argument(
        "-b",
        "--bits",
        type=int,
        help="Quantization bits",
        choices=[8, 16],
        default=8,
        required=False,
    )
    parser.add_argument(
        "-o",
        "--output_model",
        type=str,
        help="The output model path",
        default="block_quantized_model.onnx",
        required=False,
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = setup_args()

    quantization_config = BlockQuantizeConfig(
        input_model_path=args.input_model,
        output_model_path=args.output_model,
        block_size=args.block_size,
        bits=args.bits,
    )

    quantizer = BlockQuantizer(quantization_config)
    quantizer.run()
diff --git a/tools/quantize/requirements.txt b/tools/quantize/requirements.txt
index 3589a8f4..d8519a95 100644
--- a/tools/quantize/requirements.txt
+++ b/tools/quantize/requirements.txt
@@ -1,4 +1,5 @@
 opencv-python>=4.10.0
+numpy
 onnx
 onnxruntime
 onnxruntime-extensions