9 changes: 8 additions & 1 deletion src/nncf/openvino/optimized_functions/functions.py
@@ -11,6 +11,7 @@

from typing import Optional, Union

import nncf
from nncf import CompressWeightsMode
from nncf.common.utils.caching import disable_results_caching
from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
@@ -274,6 +275,7 @@ def get_integer_quantization_error(
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
reduction: str,
) -> float:
"""
Calculates a quantity characterizing the difference between floating point weights and fake quantized
@@ -285,8 +287,13 @@
:param weight: Weight array to compress.
:param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
:param config: Information on how to compress (quantize) a specific weight.
:param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius".
:return: The quantity characterizing the error of integer quantization.
"""
if reduction not in ["max_mean", "frobenius"]:
exception_str = f"Unsupported aggregation mode: {reduction}."
raise nncf.InternalError(exception_str)

original_weight_shape = weight.shape
original_reduction_axes = reduction_axes

@@ -298,7 +305,7 @@
ov_model_params = OVModelParameters()
ov_model_params.input_dtypes["weight"] = weight.dtype
model = get_integer_quantization_error_model(
ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes
ov_model_params, config, reduction, weight.shape, reduction_axes, original_weight_shape, original_reduction_axes
)

quantization_error = model([weight])[0].item()
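For orientation, here is a minimal NumPy sketch of what the two reduction modes aggregate. It mirrors the pure-NumPy fallback further down in this diff and is illustrative only, not part of the PR:

import numpy as np

def aggregate_quantization_error(weight, dequantized, reduction_axes, reduction):
    # Element-wise squared error between the original and the quantize-dequantize weight.
    if reduction == "max_mean":
        # Mean squared error per reduction group, then the worst (maximum) group value.
        layer_err = np.mean((dequantized - weight) ** 2, axis=reduction_axes)
        return float(np.max(layer_err))
    if reduction == "frobenius":
        # Frobenius norm of the full difference tensor.
        return float(np.linalg.norm((dequantized - weight).reshape(-1)))
    raise ValueError(f"Unsupported aggregation mode: {reduction}.")

With reduction="max_mean" this reproduces the previous behaviour; "frobenius" corresponds to the norm that the second _calc_weight_sensitivity used to compute inline (see the mixed-precision changes below).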
44 changes: 30 additions & 14 deletions src/nncf/openvino/optimized_functions/models.py
@@ -390,10 +390,11 @@ def get_integer_quantize_dequantize_weight_model(
def get_integer_quantization_error_model(
ov_model_params: OVModelParameters,
config: WeightCompressionConfig,
original_weight_shape: tuple,
reduction: str,
weight_shape: tuple,
original_reduction_axes: ReductionAxes,
reduction_axes: ReductionAxes,
original_weight_shape: tuple,
original_reduction_axes: ReductionAxes,
) -> ModelCallable:
"""
Get a model that calculates the quantization error for a given weight.
@@ -403,16 +404,23 @@

:param ov_model_params: OV model parameters.
:param config: Compression configuration.
:param original_weight_shape: Shape of the original weight tensor.
:param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius".
:param weight_shape: Shape of the weight tensor to be compressed.
:param original_reduction_axes: Reduction axes of the original weight tensor before reshaping.
:param reduction_axes: Axes to reduce the weight tensor.
:param original_weight_shape: Shape of the original weight tensor.
:param original_reduction_axes: Reduction axes of the original weight tensor before reshaping.
:return: A model callable that returns the quantization error.
"""
weight_shape, _, _ = _prepare_quantization_model_inputs(ov_model_params, weight_shape, None, None, reduction_axes)

return _build_integer_quantization_error_model(
config, ov_model_params, original_weight_shape, weight_shape, original_reduction_axes, reduction_axes
config,
ov_model_params,
reduction,
weight_shape,
reduction_axes,
original_weight_shape,
original_reduction_axes,
)


@@ -769,10 +777,11 @@ def _build_float_quantize_dequantize_weight_model(
def _build_integer_quantization_error_model(
config: WeightCompressionConfig,
ov_model_params: OVModelParameters,
original_weight_shape: tuple,
reduction: str,
weight_shape: tuple,
original_reduction_axes: ReductionAxes,
reduction_axes: ReductionAxes,
original_weight_shape: tuple,
original_reduction_axes: ReductionAxes,
) -> ModelCallable:
ov_parameters, ov_results, ov_model_params = _build_integer_quantize_dequantize_weight_model(
config,
@@ -786,13 +795,20 @@ def _build_integer_quantization_error_model(
weight = ov_parameters[0]
decompressed_weight = ov_results[0]

weight = convert_op(opset.reshape(weight, original_weight_shape, special_zero=False), ov.Type.f32)
decompressed_weight = convert_op(
opset.reshape(decompressed_weight, original_weight_shape, special_zero=False), ov.Type.f32
)
diff = opset.squared_difference(decompressed_weight, weight)
layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes)
quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape))))
weight = convert_op(weight, ov.Type.f32)
if reduction == "max_mean":
weight = opset.reshape(weight, original_weight_shape, special_zero=False)
decompressed_weight = opset.reshape(decompressed_weight, original_weight_shape, special_zero=False)
diff = opset.squared_difference(decompressed_weight, weight)
layer_err = opset.reduce_mean(diff, reduction_axes=original_reduction_axes)
quantization_error = opset.reduce_max(layer_err, reduction_axes=tuple(range(len(layer_err.shape))))
elif reduction == "frobenius":
diff = opset.reshape(decompressed_weight - weight, (-1,), special_zero=False)
quantization_error = opset.matmul(diff, diff, transpose_a=False, transpose_b=False)
quantization_error = opset.sqrt(quantization_error)
else:
msg = f"Unsupported aggregation method: {reduction}."
raise ValueError(msg)

model = ov.Model([quantization_error], ov_parameters)
compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
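The "frobenius" branch avoids a dedicated norm op by flattening the difference and taking sqrt(diff @ diff). A quick NumPy check (illustrative, not part of the PR) of why that equals the Frobenius norm:

import numpy as np

rng = np.random.default_rng(0)
weight = rng.standard_normal((8, 16)).astype(np.float32)
decompressed = (weight + rng.normal(scale=1e-2, size=weight.shape)).astype(np.float32)

# Flattened self dot product followed by sqrt, as in the OV graph above ...
diff = (decompressed - weight).reshape(-1)
via_matmul = np.sqrt(diff @ diff)

# ... equals the Frobenius norm of the 2-D difference.
reference = np.linalg.norm(decompressed - weight, ord="fro")
assert np.isclose(via_matmul, reference, rtol=1e-5)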
(next file)
@@ -29,10 +29,8 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error
from nncf.quantization.algorithms.weight_compression.weight_lowering import integer_quantize_dequantize_weight
from nncf.tensor import Tensor
from nncf.tensor import functions as fns
from nncf.tensor.definitions import TensorDataType

TModel = TypeVar("TModel")
MIXED_PRECISION_CRITERIA = Registry("mixed_precision_criteria")
@@ -174,7 +172,7 @@ def _calc_weight_sensitivity(
)
backup_config = WeightCompressionConfig()
reduction_axes = weight_param.reduction_axes
int_error = get_integer_quantization_error(weight, reduction_axes, backup_config)
int_error = get_integer_quantization_error(weight, reduction_axes, backup_config, reduction="max_mean")
eps = fns.finfo(weight).eps
return 1 / (int_error + eps)

@@ -360,15 +358,7 @@ def _calc_weight_sensitivity(
)
backup_config = WeightCompressionConfig()
reduction_axes = weight_param.reduction_axes

orig_shape = weight.shape

if weight.dtype != TensorDataType.float32:
weight = weight.astype(TensorDataType.float32)

decompressed_weight = integer_quantize_dequantize_weight(weight, backup_config, reduction_axes)
decompressed_weight = decompressed_weight.reshape(orig_shape)
return fns.linalg.norm(decompressed_weight - weight, ord="fro").item()
return get_integer_quantization_error(weight, reduction_axes, backup_config, reduction="frobenius")

def _get_statistic_collector(self):
return self._backend_entity.hawq_statistic_collector(self._subset_size)
(next file)
@@ -299,6 +299,7 @@ def get_integer_quantization_error(
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
reduction: str,
) -> float:
"""
Calculates a quantity characterizing the difference between floating point weights and fake quantized
@@ -310,29 +311,35 @@
:param weight: Weight array to compress.
:param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
:param config: Information on how to compress (quantize) a specific weight.
:param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius".
:return: The quantity characterizing the error of integer quantization.
"""
if reduction not in ["max_mean", "frobenius"]:
exception_str = f"Unsupported aggregation mode: {reduction}."
raise nncf.InternalError(exception_str)

# Optimized implementation
if _can_run_optimized(weight, config.mode):
from nncf.openvino.optimized_functions import (
get_integer_quantization_error as get_integer_quantization_error_ov,
)

return get_integer_quantization_error_ov(weight, reduction_axes, config)
return get_integer_quantization_error_ov(weight, reduction_axes, config, reduction)

if weight.backend == TensorBackend.ov:
weight = weight.as_numpy_tensor()
orig_shape = weight.shape

if weight.dtype != TensorDataType.float32:
weight = weight.astype(TensorDataType.float32)

decompressed_weight = integer_quantize_dequantize_weight(weight, config, reduction_axes)

decompressed_weight = decompressed_weight.reshape(orig_shape)
diff = (decompressed_weight - weight) ** 2
layer_err = fns.mean(diff, axis=reduction_axes)
val = fns.max(layer_err)
decompressed_weight = decompressed_weight.reshape(weight.shape)
if reduction == "max_mean":
diff = (decompressed_weight - weight) ** 2
layer_err = fns.mean(diff, axis=reduction_axes)
val = fns.max(layer_err)
else:
val = fns.linalg.norm(decompressed_weight - weight, ord="fro")
return val.item()


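A hedged usage sketch of the extended API, using the imports that appear elsewhere in this diff (the weight shape and values are made up for illustration):

import numpy as np

from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error
from nncf.tensor import Tensor

weight = Tensor(np.random.rand(128, 256).astype(np.float32))
config = WeightCompressionConfig()  # default backup config, as in the mixed-precision criteria

# Worst per-group mean squared error, the pre-existing behaviour.
err_max_mean = get_integer_quantization_error(weight, 1, config, reduction="max_mean")

# Frobenius norm of the whole dequantization error, newly supported by this PR.
err_frobenius = get_integer_quantization_error(weight, 1, config, reduction="frobenius")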
(next file)
@@ -730,7 +730,7 @@ def __str__(self):
def test_quantization_error_calculation(desc: QuantErrorDesc):
weight = Tensor(desc.weight)
axis = 1
actual_error = get_integer_quantization_error(weight, axis, desc.config)
actual_error = get_integer_quantization_error(weight, axis, desc.config, reduction="max_mean")
ref_error = desc.ref_error
atol = desc.atol if desc.atol is not None else 1e-8
assert np.allclose(actual_error, ref_error, atol=atol)
(next file)
@@ -274,7 +274,8 @@ def test_quantization_alignment(weight_shape, config, quantization_task, tensor_
@pytest.mark.parametrize("config", INT4_COMPRESSION_CONFIGS, ids=[str(c) for c in INT4_COMPRESSION_CONFIGS])
@pytest.mark.parametrize("tensor_backend", [TensorBackend.numpy, "auto"])
@pytest.mark.parametrize("dtype", SUPPORTED_WEIGHT_DTYPES)
def test_integer_quantization_error_alignment(weight_shape, config, tensor_backend, dtype):
@pytest.mark.parametrize("reduction", ["max_mean", "frobenius"])
def test_integer_quantization_error_alignment(weight_shape, config, tensor_backend, dtype, reduction):
results = defaultdict(dict)
# Iterate over two implementations
for cb in [ComputationBackend.NumPy, ComputationBackend.OV]:
@@ -289,16 +290,20 @@ def test_integer_quantization_error_alignment(weight_shape, config, tensor_backe
fn_to_patch = opt_fns.get_integer_quantization_error
patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}"
with patch(patch_path, side_effect=fn_to_patch) as mock:
results[cb]["quantization_error"] = get_integer_quantization_error(weight, REDUCTION_AXES, config)
results[cb]["quantization_error"] = get_integer_quantization_error(
weight, REDUCTION_AXES, config, reduction=reduction
)

if cb == ComputationBackend.NumPy:
mock.assert_not_called()
else:
mock.assert_called_once()

# It seems like numpy and openvino summate elements in different order during reduce_sum / reduce_mean computation.
# This results in small numerical differences.
_check_values(results, atol=1e-6)
# For "max_mean", it seems like numpy and openvino summate elements in different order during
# reduce_sum / reduce_mean computation. This results in small numerical differences.
# For "frobenius", there is a bit larger error, possibly because np.linalg.norm relies on BLAS/LAPACK
# implementations.
_check_values(results, atol=1e-6 if reduction == "max_mean" else 0, rtol=1e-4 if reduction == "frobenius" else 0)


@pytest.mark.parametrize("weight_shape", [WEIGHT_SHAPE], ids=[""])
@@ -498,7 +503,7 @@ def _check_backends_and_dtypes(
assert decompressed_weight.dtype == TensorDataType.float32


def _check_values(results, atol=0.0):
def _check_values(results, atol=0.0, rtol=0.0):
def format_list_of_floats(lst, n_first=32):
return ", ".join(f"{x:.10f}" for x in lst[:n_first])

@@ -516,7 +521,7 @@ def format_list_of_floats(lst, n_first=32):
# misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511.

try:
np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=0)
np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=rtol)
except AssertionError:
not_equal_mask = np.not_equal(ov_result.data, numpy_result.data)
msg = (
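Regarding the tolerance comment in the test above: a tiny illustration (not from the repository) of how accumulation order alone can change a float32 reduction:

import numpy as np

rng = np.random.default_rng(42)
x = rng.standard_normal(1_000_000).astype(np.float32)

# Same elements, different accumulation order: the results usually differ in the last bits.
forward = np.add.reduce(x)
reversed_order = np.add.reduce(x[::-1])
print(forward, reversed_order, abs(forward - reversed_order))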
(next file)
@@ -145,6 +145,7 @@ def get(self, ov_model_params_kwargs=None, get_model_kwargs=None):
weight_shape=(10, 2, 2),
original_reduction_axes=(1,),
reduction_axes=(2,),
reduction="max_mean",
),
),
ModelGetter(