diff --git a/src/nncf/openvino/graph/nncf_graph_builder.py b/src/nncf/openvino/graph/nncf_graph_builder.py index d14b6cd4946..334808da8c3 100644 --- a/src/nncf/openvino/graph/nncf_graph_builder.py +++ b/src/nncf/openvino/graph/nncf_graph_builder.py @@ -44,6 +44,8 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype: type_name = ov_type.get_type_name() conversion_map = { "nf4": "float", + "f4e2m1": "float", + "f8e8m0": "float", "f8e4m3": "float", "f8e5m2": "float", "f16": "float", diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index 622a63c1b7b..455ecb18279 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -107,17 +107,16 @@ def do_float_quantization( precomputed_scale: Optional[Tensor] = None, ) -> tuple[Tensor, Tensor, Tensor]: """ - Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. - For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved + Computes quantization scale if not provided, and performs corresponding float weight quantization. + NF4 format uses 16 levels in [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6]. :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized weight tensor and corresponding scale tensor. """ - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape @@ -129,7 +128,8 @@ def do_float_quantization( if weight.backend == TensorBackend.ov: # Return ov tensors in target precision to seamlessly insert them into openvino model later ov_model_params.return_ov_tensors = True - ov_model_params.output_dtypes.update({"compressed_weight": TensorDataType.nf4}) + weight_dtype = TensorDataType.f4e2m1 if config.mode == CompressWeightsMode.MXFP4 else TensorDataType.nf4 + ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype}) model = get_float_quantization_model( ov_model_params, @@ -235,7 +235,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 97420969001..f64b5d020a7 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -286,7 +286,8 @@ def get_float_quantization_model( reduction_axes: Optional[ReductionAxes] = None, ) -> Union[ModelCallable, ModelAsNodes]: """ - Get a model that compresses weights to float (currently only nf4) destination type using the given configuration. + Get a model that compresses weights to float (currently nf4 or mxfp4) destination type using the given + configuration. :param ov_model_params: OV model parameters. :param config: Compression configuration. @@ -571,7 +572,7 @@ def _build_float_quantization_model( reduction_axes: Optional[ReductionAxes] = None, return_nodes: bool = False, ) -> Union[ModelCallable, ModelAsNodes]: - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] default_input_dtypes = {"scale": TensorDataType.float32} default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32} @@ -597,8 +598,7 @@ def _build_float_quantization_model( ) # Validate output dtypes - # TODO: add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved - valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4] + valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1] if compressed_weight_dtype not in valid_compressed_weight_dtypes: msg = ( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. 
" @@ -626,8 +626,16 @@ def _build_float_quantization_model( eps = np.finfo(np.float32).eps scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale) + if config.mode == CompressWeightsMode.MXFP4: + scale = scale / opset.constant(6.0, ov.Type.f32) + scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32)) + scale = opset.ceil(scale) + scale = opset.clamp(scale, -127.0, 127.0) + scale = opset.power(opset.constant(2.0, ov.Type.f32), scale) + compressed_weight = divide_op(weight, scale) - compressed_weight = convert_op(compressed_weight, ov.Type.nf4) + target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1 + compressed_weight = convert_op(compressed_weight, target_dtype) compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] diff --git a/src/nncf/quantization/algorithms/weight_compression/constants.py b/src/nncf/quantization/algorithms/weight_compression/constants.py index 6119fd8f83c..69f71839827 100644 --- a/src/nncf/quantization/algorithms/weight_compression/constants.py +++ b/src/nncf/quantization/algorithms/weight_compression/constants.py @@ -33,6 +33,27 @@ dtype=np.float32, ) +MXFP4_QUANTILES = np.array( + [ + -6.0, + -4.0, + -3.0, + -2.0, + -1.5, + -1.0, + -0.5, + -0.0, + 0.5, + 1.0, + 1.5, + 2.0, + 3.0, + 4.0, + 6.0, + ], + dtype=np.float32, +) + CB4_QUANTILES = np.array( [ @@ -77,3 +98,6 @@ ], dtype=np.float32, ) + + +CENTER_OF_MXFP4_QUANTILES = (MXFP4_QUANTILES[1:] + MXFP4_QUANTILES[:-1]) / 2 diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0e0783cf468..6d17dd28870 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -19,7 +19,9 @@ from nncf.errors import UnsupportedModelError from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point @@ -147,15 +149,15 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization. - For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved + NF4 format uses 16 levels in [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6]. + For MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. + For CODEBOOK currently returns normalized weight without quantization. :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. 
- :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and + :return: Returns quantized (for MXFP8_E4M3 and codebook normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ assert not config.is_integer @@ -165,7 +167,7 @@ def do_float_quantization( weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) # Optimized implementation - if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): + if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight): from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale) @@ -180,19 +182,19 @@ def do_float_quantization( if scale is None: scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) - if config.mode == CompressWeightsMode.NF4: + if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]: if original_weight_backend == TensorBackend.ov: - # Can convert through OpenVINO and return OpenVINO-native NF4 tensor - compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) + # Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor + target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 + compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype) else: - compressed_weight = _calculate_nf4_quantized_weight(norm_weight) + compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode) elif config.is_codebook: compressed_weight, indexes = _calculate_codebook_quantized_weight( norm_weight, quantiles=config.get_numpy_codebook() ) return compressed_weight, scale, indexes else: - # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved compressed_weight = norm_weight return compressed_weight, scale, None @@ -205,8 +207,8 @@ def float_quantize_dequantize_weight( return_compressed_weight: Optional[bool] = False, ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ - First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values. - MXFP4 and MXFP8_E4M3 mode is currently not supported. + First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values. + MXFP8_E4M3 mode is currently not supported. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. @@ -215,11 +217,15 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] - # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3, once ticket 164851 is resolved + assert config.mode in [ + CompressWeightsMode.NF4, + CompressWeightsMode.MXFP4, + CompressWeightsMode.CODEBOOK, + CompressWeightsMode.CB4_F8E4M3, + ] # Optimized implementation - if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): + if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight): from nncf.openvino.optimized_functions import ( float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov, ) @@ -508,17 +514,30 @@ def integer_quantize_dequantize_weight( return decompressed_weight -def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: +def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor: """ - Performs NF4 quantization. Look-up table is used to "round" or "quantize" to the closest quant. + Performs float (currently NF4 or MXFP4) quantization. Look-up table is used to "round" or "quantize" to the + closest quant. - :param norm_weight: Weight tensor to quantize already normalized to [-1, 1] range. - :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1]. + :param norm_weight: Normalized weight tensor to quantize. + :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants. """ - center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend) - indexes = fns.searchsorted(center_nf4_quantiles, norm_weight) - nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend) - quantized_weight = nf4_quantiles[indexes] + assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] + quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else MXFP4_QUANTILES + quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_MXFP4_QUANTILES + quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend) + indexes = fns.searchsorted(quantile_centers, norm_weight) + quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend) + + if mode == CompressWeightsMode.MXFP4: + # If in-between two quantiles, round to the nearest even quantile. 
+ shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1) + dist_left = fns.abs(norm_weight - quantiles[indexes]) + dist_right = fns.abs(norm_weight - quantiles[shifted_indexes]) + choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted_indexes + 1) % 2 == 0)) + indexes = fns.where(choose_right, shifted_indexes, indexes) + + quantized_weight = quantiles[indexes] return quantized_weight diff --git a/src/nncf/tensor/definitions.py b/src/nncf/tensor/definitions.py index 6b91d4a48b8..da2bf4a6aee 100644 --- a/src/nncf/tensor/definitions.py +++ b/src/nncf/tensor/definitions.py @@ -44,6 +44,8 @@ class TensorDataType(StrEnum): float64 = auto() f8e4m3 = auto() f8e5m2 = auto() + f8e8m0 = auto() + f4e2m1 = auto() nf4 = auto() int8 = auto() int32 = auto() diff --git a/src/nncf/tensor/functions/openvino_numeric.py b/src/nncf/tensor/functions/openvino_numeric.py index 9b16516cad0..aadf75a8ccd 100644 --- a/src/nncf/tensor/functions/openvino_numeric.py +++ b/src/nncf/tensor/functions/openvino_numeric.py @@ -23,6 +23,8 @@ DTYPE_MAP: dict[TensorDataType, ov.Type] = { TensorDataType.nf4: ov.Type.nf4, + TensorDataType.f4e2m1: ov.Type.f4e2m1, + TensorDataType.f8e8m0: ov.Type.f8e8m0, TensorDataType.f8e4m3: ov.Type.f8e4m3, TensorDataType.f8e5m2: ov.Type.f8e5m2, TensorDataType.float16: ov.Type.f16, @@ -42,6 +44,8 @@ TensorDataType.int4, TensorDataType.uint4, TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e8m0, TensorDataType.f8e4m3, TensorDataType.f8e5m2, ] @@ -95,7 +99,7 @@ def _(a: ov.Tensor, shape: Union[int, tuple[int, ...]]) -> ov.Tensor: @numeric.as_numpy_tensor.register def _(a: ov.Tensor) -> NDArray[Any]: - # Cannot convert bfloat16, uint4, int4, nf4, f8e4m3, f8e5m2 to numpy directly + # Cannot convert bfloat16, uint4, int4, nf4, f4e2m1, f8e8m0, f8e4m3, f8e5m2 to numpy directly a_dtype = DTYPE_MAP_REV[a.get_element_type()] if a_dtype in NATIVE_OV_CAST_DTYPES: dtype = TensorDataType.float32 diff --git a/src/nncf/tensor/tensor.py b/src/nncf/tensor/tensor.py index 1972e55ec4b..c8a05bc2c05 100644 --- a/src/nncf/tensor/tensor.py +++ b/src/nncf/tensor/tensor.py @@ -84,6 +84,12 @@ def __len__(self) -> int: # built-in operations + def __or__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return Tensor(self.data | unwrap_tensor_data(other)) + + def __and__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return Tensor(self.data & unwrap_tensor_data(other)) + def __add__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: return Tensor(self.data + unwrap_tensor_data(other)) @@ -144,6 +150,9 @@ def __ifloordiv__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: self._data //= unwrap_tensor_data(other) return self + def __mod__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return cast(Tensor, _call_function("_binary_op_nowarn", self, other, operator.mod)) + def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: return Tensor(self.data @ unwrap_tensor_data(other)) diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index e0e597e8fb1..fb1aa3d00ec 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -48,6 +48,8 @@ } BINARY_OPERATORS = ["add", "sub", "pow", "mul", "truediv", "floordiv"] +BOOLEAN_OPERATOR_MAP = {"and": operator.and_, "or": operator.or_} + COMPARISON_OPERATOR_MAP = { "lt": operator.lt, "le": operator.le, @@ -98,6 +100,25 @@ def test_operator_clone(self): assert 
id(tensor_a.data) is not id(tensor_b.data) assert all(tensor_a == tensor_b) + @pytest.mark.parametrize("op_name", BOOLEAN_OPERATOR_MAP.keys()) + @pytest.mark.parametrize("value", [True, False]) + def test_operators_bool(self, op_name, value): + tensor_a = self.to_tensor([True, False]) + + nncf_tensor_a = Tensor(tensor_a) + + fn = BOOLEAN_OPERATOR_MAP[op_name] + res = fn(tensor_a, value) + res_nncf = fn(nncf_tensor_a, value) + + assert res.dtype == res_nncf.data.dtype + assert all(res == res_nncf.data) + assert isinstance(res_nncf, Tensor) + if ( + self.backend() != TensorBackend.tf + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. + assert res_nncf.device == nncf_tensor_a.device + @pytest.mark.parametrize("op_name", OPERATOR_MAP.keys()) def test_operators_tensor(self, op_name): tensor_a = self.to_tensor([1.0, 2.0]) @@ -115,7 +136,7 @@ def test_operators_tensor(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", OPERATOR_MAP.keys()) @@ -134,7 +155,7 @@ def test_operators_int(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", BINARY_OPERATORS) @@ -153,7 +174,7 @@ def test_operators_int_rev(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. 
         assert res_nncf.device == nncf_tensor_a.device

     @pytest.mark.parametrize("op_name", COMPARISON_OPERATOR_MAP.keys())
@@ -2114,6 +2135,8 @@ def test_fn_zeros(self):
             TensorDataType.int4,
             TensorDataType.uint4,
             TensorDataType.nf4,
+            TensorDataType.f4e2m1,
+            TensorDataType.f8e8m0,
             TensorDataType.f8e4m3,
             TensorDataType.f8e5m2,
         ]
@@ -2146,6 +2169,8 @@ def test_fn_eye(self, n, m, ref):
             TensorDataType.int4,
             TensorDataType.uint4,
             TensorDataType.nf4,
+            TensorDataType.f4e2m1,
+            TensorDataType.f8e8m0,
             TensorDataType.f8e4m3,
             TensorDataType.f8e5m2,
         ]
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 017d695c25e..692f51c42d3 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -47,7 +47,7 @@
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
 from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
 from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
-from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight
+from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_float_quantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -1799,7 +1799,7 @@ def test_nf4_quantization_mid_quant(weight, scale):
     scale = Tensor(scale)
     # norm_weight equals -0.8480964 (one bit away from the first NF4 quantile center)
     norm_weight = _calculate_normalized_weight(weight, scale)
-    nf4_quant = _calculate_nf4_quantized_weight(norm_weight)
+    nf4_quant = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.NF4)

     norm_weight_ov_backend = Tensor(ov.Tensor(norm_weight.data, norm_weight.shape, ov.Type.f32))
     ref_nf4_quant = norm_weight_ov_backend.astype(TensorDataType.nf4).as_numpy_tensor()
@@ -1807,6 +1807,33 @@
     np.testing.assert_allclose(nf4_quant.data, ref_nf4_quant.data, atol=0, rtol=0)


+@pytest.mark.parametrize(
+    "input_val,expected_val,description",
+    [
+        (-7.0, -6.0, "Lower than quantile range"),
+        (7.0, 6.0, "Higher than quantile range"),
+        (-5.0, -4.0, "Should pick nearest EVEN index (index 2: -4.0)"),
+        (-3.5, -4.0, "Should pick nearest EVEN index (index 2: -4.0)"),
+        (1.75, 2.0, "Should pick nearest EVEN index (index 12: 2.0)"),
+        (2.5, 2.0, "Should pick nearest EVEN index (index 12: 2.0)"),
+        (-4.0, -4.0, "Exactly on a quantile"),
+        (0.0, 0.0, "Value 0.0 is on quantile boundary"),
+        (-0.0, 0.0, "Value -0.0 is on quantile boundary"),
+        (-0.25, 0.0, "Should round up, 0.0 (even index)"),
+        (0.25, 0.0, "Should round down, 0.0 (even index)"),
+        (-0.49, -0.5, "Closer to -0.5"),
+        (-0.51, -0.5, "Closer to -0.5"),
+    ],
+)
+def test_mxfp4_quantization_edge_cases(input_val, expected_val, description):
+    norm_weight = Tensor(np.array([input_val], dtype=np.float32))
+    result = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.MXFP4)
+
+    assert result.data[0] == expected_val, (
+        f"{description}: Expected {expected_val}, got {result.data[0]} for input value {input_val}"
+    )
+
+
 @pytest.mark.parametrize(
     "codebook",
     [
diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py
index d7f8a17bfbe..8803aad7d59 100644
--- a/tests/openvino/optimized_functions/test_compression_functions.py
+++ b/tests/openvino/optimized_functions/test_compression_functions.py
@@ -70,11 +70,12 @@ class QuantizationTask(Enum):
 FP4_COMPRESSION_CONFIGS = [
     WeightCompressionConfig(CompressWeightsMode.NF4),
     WeightCompressionConfig(CompressWeightsMode.NF4, group_size=2),
+    WeightCompressionConfig(CompressWeightsMode.MXFP4, group_size=32),
 ]

 COMPRESSION_CONFIGS = INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS

-WEIGHT_SHAPE = (10000, 4)
+WEIGHT_SHAPE = (10000, 32)

 REDUCTION_AXES = (1,)

@@ -249,6 +250,10 @@ def test_quantization_alignment(weight_shape, config, quantization_task, tensor_
             np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0)

     # Save results for comparison between implementations
+    if group_size != -1 and not precompute_s_zp:
+        weight, _ = reshape_weight_for_grouped_quantization(weight, REDUCTION_AXES, group_size)
+        results[cb]["input"] = weight.as_numpy_tensor()
+
     if quantization_task != QuantizationTask.Q:
         results[cb]["decompressed_weight"] = decompressed_weight
     if quantization_task != QuantizationTask.Q_DQ:
@@ -302,7 +307,9 @@ def test_integer_quantization_error_alignment(weight_shape, config, tensor_backe
     else:
         mock.assert_called_once()

-    _check_values(results)
+    # NumPy and OpenVINO appear to accumulate elements in a different order during reduce_sum / reduce_mean
+    # computation. This results in small numerical differences.
+    _check_values(results, atol=1e-6)


 @pytest.mark.xfail(
@@ -370,11 +377,14 @@ def get_input_node_data(node: ov.Node, input_id: int) -> Tensor:
         or compression_kwargs.get("lora_correction")
     )

-    if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM]:
-        if weight_dtype in [TensorDataType.f8e4m3, TensorDataType.f8e5m2]:
+    if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM, CompressWeightsMode.MXFP4]:
+        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM] and weight_dtype in [
+            TensorDataType.f8e4m3,
+            TensorDataType.f8e5m2,
+        ]:
             pytest.skip("INT8 compression is not supported for f8 dtypes.")
         if is_data_aware:
-            pytest.skip("Data-aware compression is not supported for INT8 modes.")
+            pytest.skip("Data-aware compression is not supported for INT8 or MXFP4 modes.")
     else:
         compression_kwargs["all_layers"] = True

@@ -466,12 +476,12 @@ def _check_backends_and_dtypes(
         and config.num_bits == 4
     ):
         # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed
-        # zero point must be in ov backend and have (u)int4 or nf4 dtypes in order to be able to insert them into OV
-        # model without re-packing
+        # zero point must be in ov backend and have (u)int4/nf4/f4e2m1 dtypes in order to be able to insert them into
+        # OV model without re-packing
         if config.is_integer:
             ref_dtype = TensorDataType.uint4 if config.is_asym_mode else TensorDataType.int4
         else:
-            ref_dtype = TensorDataType.nf4
+            ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
         assert compressed_weight.backend == TensorBackend.ov
         assert compressed_weight.dtype == ref_dtype
         if config.is_asym_mode and not precompute_s_zp:
@@ -480,14 +490,17 @@ def _check_backends_and_dtypes(
     else:
         if
quantization_task != QuantizationTask.Q_DQ: # Otherwise, for integer compression, compressed weight and zero point must be returned in numpy backend, - # compressed weight must be of (u)int8, zero point -- in int32; for nf4 compression, the resulting + # compressed weight must be of (u)int8, zero point -- in int32; for nf4/f4e2m1 compression, the resulting # data type and backend depends on the input tensor backend. if config.is_integer: ref_backend = TensorBackend.numpy ref_dtype = TensorDataType.uint8 if config.is_asym_mode else TensorDataType.int8 else: ref_backend = weight_tensor_backend - ref_dtype = TensorDataType.nf4 if weight_tensor_backend == TensorBackend.ov else TensorDataType.float32 + if weight_tensor_backend == TensorBackend.ov: + ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 + else: + ref_dtype = TensorDataType.float32 assert compressed_weight.backend == ref_backend assert compressed_weight.dtype == ref_dtype if config.is_asym_mode and not precompute_s_zp: @@ -498,7 +511,10 @@ def _check_backends_and_dtypes( assert decompressed_weight.dtype == TensorDataType.float32 -def _check_values(results): +def _check_values(results, atol=0.0): + def format_list_of_floats(lst): + return ", ".join(f"{x:.10f}" for x in lst) + # Check that the computed tensors are equal between implementations keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: @@ -506,12 +522,29 @@ def _check_values(results): ov_result = results[ComputationBackend.OV][key] if isinstance(numpy_result, float) and isinstance(ov_result, float): - numpy_result = np.array([numpy_result], dtype=np.float32) - ov_result = np.array([ov_result], dtype=np.float32) + numpy_result = Tensor(np.array([numpy_result], dtype=np.float32)) + ov_result = Tensor(np.array([ov_result], dtype=np.float32)) # Note: For static-shaped OV models doing asymmetric compression with convertable divisions there maybe # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. - np.testing.assert_allclose( - ov_result.data, numpy_result.data, atol=0, rtol=0, err_msg=f"Results do not align for {key}." - ) + try: + np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=0) + except AssertionError: + not_equal_mask = np.not_equal(ov_result.data, numpy_result.data) + msg = ( + f"Results do not align for {key} with " + f"{not_equal_mask.sum() / ov_result.data.size * 100:.2f} % misalignment ratio.\n" + f"OV result: {format_list_of_floats(ov_result.data[not_equal_mask])}\n" + f"NumPy result: {format_list_of_floats(numpy_result.data[not_equal_mask])}\n" + ) + if "input" in results[ComputationBackend.OV] and "input" in results[ComputationBackend.NumPy]: + numpy_input = results[ComputationBackend.NumPy]["input"].data + ov_input = results[ComputationBackend.OV]["input"].data + np.testing.assert_allclose(numpy_input, ov_input, atol=0, rtol=0) + msg += f"Input values : {format_list_of_floats(numpy_input[not_equal_mask])}\n" + misaligned_groups_mask = np.any(not_equal_mask, axis=-1) + misaligned_groups = numpy_input[misaligned_groups_mask, ...] + misaligned_groups = np.reshape(misaligned_groups, (-1, misaligned_groups.shape[-1])) + msg += f"First 10 misaligned groups: {[it for it in misaligned_groups][:10]}\n" + raise AssertionError(msg)
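Reviewer note: the snippet below is a minimal, standalone NumPy sketch of the MXFP4 behavior this patch builds, approximately mirroring the scale adjustment added to _build_float_quantization_model (abs-max divided by 6, ceil of log2, clamp to [-127, 127], power of two) together with the round-half-to-even lookup from _calculate_float_quantized_weight. The names mxfp4_quantize_dequantize and MXFP4_GRID are illustrative only and are not part of the NNCF API; the reference implementation is the one in the diff above.

import numpy as np

# E2M1 magnitudes, matching MXFP4_QUANTILES from constants.py in this patch.
MXFP4_GRID = np.array(
    [-6.0, -4.0, -3.0, -2.0, -1.5, -1.0, -0.5, -0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=np.float32
)
GRID_CENTERS = (MXFP4_GRID[1:] + MXFP4_GRID[:-1]) / 2


def mxfp4_quantize_dequantize(weight: np.ndarray, group_size: int = 32) -> np.ndarray:
    """Quantize-dequantize along the last axis with an MXFP4-like scheme (illustrative sketch only)."""
    orig_shape = weight.shape
    w = weight.astype(np.float32).reshape(-1, group_size)

    # Shared power-of-two scale per group: ceil(log2(max_abs / 6)), clamped to the E8M0 exponent range.
    max_abs = np.maximum(np.abs(w).max(axis=-1, keepdims=True), np.finfo(np.float32).eps)
    exponent = np.clip(np.ceil(np.log2(max_abs / 6.0)), -127.0, 127.0)
    scale = np.power(2.0, exponent).astype(np.float32)

    # Normalize and snap to the nearest grid value; ties go to the candidate whose 1-based grid index is even,
    # matching the tie-breaking exercised by test_mxfp4_quantization_edge_cases.
    norm = w / scale
    left = np.searchsorted(GRID_CENTERS, norm)
    right = np.clip(left + 1, 0, MXFP4_GRID.size - 1)
    dist_left = np.abs(norm - MXFP4_GRID[left])
    dist_right = np.abs(norm - MXFP4_GRID[right])
    take_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((right + 1) % 2 == 0))
    quantized = MXFP4_GRID[np.where(take_right, right, left)]

    return (quantized * scale).reshape(orig_shape)


if __name__ == "__main__":
    w = np.random.default_rng(0).standard_normal((4, 32)).astype(np.float32)
    print("max abs error:", np.abs(w - mxfp4_quantize_dequantize(w)).max())

For example, a normalized value of -5.0 is equidistant from -6.0 and -4.0 and snaps to -4.0 (1-based grid index 2), which is the case covered by the (-5.0, -4.0) entry in the new edge-case test.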