diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index cb9845bfc22..077cfbd7db2 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -11,6 +11,7 @@ from typing import Optional, Union +import nncf from nncf import CompressWeightsMode from nncf.common.utils.caching import disable_results_caching from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE @@ -274,6 +275,7 @@ def get_integer_quantization_error( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, + reduction: str, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -285,8 +287,13 @@ def get_integer_quantization_error( :param weight: Weight array to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Information on how to compress (quantize) a specific weight. + :param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius". :return: The quantity characterizing the error of integer quantization. """ + if reduction not in ["max_mean", "frobenius"]: + exception_str = f"Unsupported aggregation mode: {reduction}." 
+ raise nncf.InternalError(exception_str) + original_weight_shape = weight.shape original_reduction_axes = reduction_axes @@ -298,7 +305,7 @@ def get_integer_quantization_error( ov_model_params = OVModelParameters() ov_model_params.input_dtypes["weight"] = weight.dtype model = get_integer_quantization_error_model( - ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes + ov_model_params, config, reduction, weight.shape, reduction_axes, original_weight_shape, original_reduction_axes ) quantization_error = model([weight])[0].item() diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 64be0e0b134..f6b32b0b195 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -390,10 +390,11 @@ def get_integer_quantize_dequantize_weight_model( def get_integer_quantization_error_model( ov_model_params: OVModelParameters, config: WeightCompressionConfig, - original_weight_shape: tuple, + reduction: str, weight_shape: tuple, - original_reduction_axes: ReductionAxes, reduction_axes: ReductionAxes, + original_weight_shape: tuple, + original_reduction_axes: ReductionAxes, ) -> ModelCallable: """ Get a model that calculates the quantization error for a given weight. @@ -403,16 +404,23 @@ def get_integer_quantization_error_model( :param ov_model_params: OV model parameters. :param config: Compression configuration. - :param original_weight_shape: Shape of the original weight tensor. + :param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius". :param weight_shape: Shape of the weight tensor to be compressed. - :param original_reduction_axes: Reduction axes of the original weight tensor before reshaping. :param reduction_axes: Axes to reduce the weight tensor. + :param original_weight_shape: Shape of the original weight tensor. 
+ :param original_reduction_axes: Reduction axes of the original weight tensor before reshaping. :return: A model callable that returns the quantization error. """ weight_shape, _, _ = _prepare_quantization_model_inputs(ov_model_params, weight_shape, None, None, reduction_axes) return _build_integer_quantization_error_model( - config, ov_model_params, original_weight_shape, weight_shape, original_reduction_axes, reduction_axes + config, + ov_model_params, + reduction, + weight_shape, + reduction_axes, + original_weight_shape, + original_reduction_axes, ) @@ -769,10 +777,11 @@ def _build_float_quantize_dequantize_weight_model( def _build_integer_quantization_error_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, - original_weight_shape: tuple, + reduction: str, weight_shape: tuple, - original_reduction_axes: ReductionAxes, reduction_axes: ReductionAxes, + original_weight_shape: tuple, + original_reduction_axes: ReductionAxes, ) -> ModelCallable: ov_parameters, ov_results, ov_model_params = _build_integer_quantize_dequantize_weight_model( config, @@ -786,13 +795,20 @@ def _build_integer_quantization_error_model( weight = ov_parameters[0] decompressed_weight = ov_results[0] - weight = convert_op(opset.reshape(weight, original_weight_shape, special_zero=False), ov.Type.f32) - decompressed_weight = convert_op( - opset.reshape(decompressed_weight, original_weight_shape, special_zero=False), ov.Type.f32 - ) - diff = opset.squared_difference(decompressed_weight, weight) - layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes) - quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape)))) + weight = convert_op(weight, ov.Type.f32) + if reduction == "max_mean": + weight = opset.reshape(weight, original_weight_shape, special_zero=False) + decompressed_weight = opset.reshape(decompressed_weight, original_weight_shape, special_zero=False) + diff = opset.squared_difference(decompressed_weight, 
weight) + layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes) + quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape)))) + elif reduction == "frobenius": + diff = opset.reshape(decompressed_weight - weight, (-1,), special_zero=False) + quantization_error = opset.matmul(diff, diff, transpose_a=False, transpose_b=False) + quantization_error = opset.sqrt(quantization_error) + else: + msg = f"Unsupported aggregation method: {reduction}." + raise ValueError(msg) model = ov.Model([quantization_error], ov_parameters) compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) diff --git a/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py index fdb51b8d69c..8dd08dd397b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/src/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -29,10 +29,8 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error -from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight from nncf.tensor import Tensor from nncf.tensor import functions as fns -from nncf.tensor.definitions import TensorDataType TModel = TypeVar("TModel") MIXED_PRECISION_CRITERIA = Registry("mixed_precision_criteria") @@ -174,7 +172,7 @@ def _calc_weight_sensitivity( ) backup_config = WeightCompressionConfig() reduction_axes = weight_param.reduction_axes - int_error = get_integer_quantization_error(weight, reduction_axes, backup_config) + int_error = get_integer_quantization_error(weight, reduction_axes, backup_config, 
reduction="max_mean") eps = fns.finfo(weight).eps return 1 / (int_error + eps) @@ -360,15 +358,7 @@ def _calc_weight_sensitivity( ) backup_config = WeightCompressionConfig() reduction_axes = weight_param.reduction_axes - - orig_shape = weight.shape - - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - decompressed_weight = integer_quantize_dequantize_weight(weight, backup_config, reduction_axes) - decompressed_weight = decompressed_weight.reshape(orig_shape) - return fns.linalg.norm(decompressed_weight - weight, ord="fro").item() + return get_integer_quantization_error(weight, reduction_axes, backup_config, reduction="frobenius") def _get_statistic_collector(self): return self._backend_entity.hawq_statistic_collector(self._subset_size) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d0c96e952fb..808646351ba 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -299,6 +299,7 @@ def get_integer_quantization_error( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, + reduction: str, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -310,29 +311,35 @@ def get_integer_quantization_error( :param weight: Weight array to compress. :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). :param config: Information on how to compress (quantize) a specific weight. + :param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius". :return: The quantity characterizing the error of integer quantization. """ + if reduction not in ["max_mean", "frobenius"]: + exception_str = f"Unsupported aggregation mode: {reduction}." 
+ raise nncf.InternalError(exception_str) + # Optimized implementation if _can_run_optimized(weight, config.mode): from nncf.openvino.optimized_functions import ( get_integer_quantization_error as get_integer_quantization_error_ov, ) - return get_integer_quantization_error_ov(weight, reduction_axes, config) + return get_integer_quantization_error_ov(weight, reduction_axes, config, reduction) if weight.backend == TensorBackend.ov: weight = weight.as_numpy_tensor() - orig_shape = weight.shape if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) decompressed_weight = integer_quantize_dequantize_weight(weight, config, reduction_axes) - - decompressed_weight = decompressed_weight.reshape(orig_shape) - diff = (decompressed_weight - weight) ** 2 - layer_err = fns.mean(diff, axis=reduction_axes) - val = fns.max(layer_err) + decompressed_weight = decompressed_weight.reshape(weight.shape) + if reduction == "max_mean": + diff = (decompressed_weight - weight) ** 2 + layer_err = fns.mean(diff, axis=reduction_axes) + val = fns.max(layer_err) + else: + val = fns.linalg.norm(decompressed_weight - weight, ord="fro") return val.item() diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5a38bd3a79b..e1a337f7005 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -730,7 +730,7 @@ def __str__(self): def test_quantization_error_calculation(desc: QuantErrorDesc): weight = Tensor(desc.weight) axis = 1 - actual_error = get_integer_quantization_error(weight, axis, desc.config) + actual_error = get_integer_quantization_error(weight, axis, desc.config, reduction="max_mean") ref_error = desc.ref_error atol = desc.atol if desc.atol is not None else 1e-8 assert np.allclose(actual_error, ref_error, atol=atol) diff --git 
a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 6a103dab25d..1ba6dfc477f 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -274,7 +274,8 @@ def test_quantization_alignment(weight_shape, config, quantization_task, tensor_ @pytest.mark.parametrize("config", INT4_COMPRESSION_CONFIGS, ids=[str(c) for c in INT4_COMPRESSION_CONFIGS]) @pytest.mark.parametrize("tensor_backend", [TensorBackend.numpy, "auto"]) @pytest.mark.parametrize("dtype", SUPPORTED_WEIGHT_DTYPES) -def test_integer_quantization_error_alignment(weight_shape, config, tensor_backend, dtype): +@pytest.mark.parametrize("reduction", ["max_mean", "frobenius"]) +def test_integer_quantization_error_alignment(weight_shape, config, tensor_backend, dtype, reduction): results = defaultdict(dict) # Iterate over two implementations for cb in [ComputationBackend.NumPy, ComputationBackend.OV]: @@ -289,16 +290,20 @@ def test_integer_quantization_error_alignment(weight_shape, config, tensor_backe fn_to_patch = opt_fns.get_integer_quantization_error patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}" with patch(patch_path, side_effect=fn_to_patch) as mock: - results[cb]["quantization_error"] = get_integer_quantization_error(weight, REDUCTION_AXES, config) + results[cb]["quantization_error"] = get_integer_quantization_error( + weight, REDUCTION_AXES, config, reduction=reduction + ) if cb == ComputationBackend.NumPy: mock.assert_not_called() else: mock.assert_called_once() - # It seems like numpy and openvino summate elements in different order during reduce_sum / reduce_mean computation. - # This results in small numerical differences. - _check_values(results, atol=1e-6) + # For "max_mean", it seems that numpy and openvino sum elements in a different order during + # reduce_sum / reduce_mean computation. 
This results in small numerical differences. + # For "frobenius", the error is slightly larger, possibly because np.linalg.norm relies on BLAS/LAPACK + # implementations. + _check_values(results, atol=1e-6 if reduction == "max_mean" else 0, rtol=1e-4 if reduction == "frobenius" else 0) @pytest.mark.parametrize("weight_shape", [WEIGHT_SHAPE], ids=[""]) @@ -498,7 +503,7 @@ def _check_backends_and_dtypes( assert decompressed_weight.dtype == TensorDataType.float32 -def _check_values(results, atol=0.0): +def _check_values(results, atol=0.0, rtol=0.0): def format_list_of_floats(lst, n_first=32): return ", ".join(f"{x:.10f}" for x in lst[:n_first]) @@ -516,7 +521,7 @@ def format_list_of_floats(lst, n_first=32): # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. try: - np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=0) + np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=rtol) except AssertionError: not_equal_mask = np.not_equal(ov_result.data, numpy_result.data) msg = ( diff --git a/tests/openvino/optimized_functions/test_ov_model_parameters.py b/tests/openvino/optimized_functions/test_ov_model_parameters.py index 12df2a7c461..a3081cf7d8e 100644 --- a/tests/openvino/optimized_functions/test_ov_model_parameters.py +++ b/tests/openvino/optimized_functions/test_ov_model_parameters.py @@ -145,6 +145,7 @@ def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): weight_shape=(10, 2, 2), original_reduction_axes=(1,), reduction_axes=(2,), + reduction="max_mean", ), ), ModelGetter(