From a42f1eb4fc477fb1c40ba466a2a4ba11842a5572 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 17 Jun 2025 19:36:16 +0200 Subject: [PATCH 01/21] WIP --- nncf/openvino/graph/nncf_graph_builder.py | 2 + .../openvino/optimized_functions/functions.py | 13 ++-- nncf/openvino/optimized_functions/models.py | 18 +++-- .../weight_compression/weight_lowering.py | 75 ++++++++++++------- nncf/tensor/definitions.py | 2 + nncf/tensor/functions/openvino_numeric.py | 8 +- .../quantization/test_weights_compression.py | 4 +- .../test_compression_functions.py | 64 +++++++++++----- 8 files changed, 127 insertions(+), 59 deletions(-) diff --git a/nncf/openvino/graph/nncf_graph_builder.py b/nncf/openvino/graph/nncf_graph_builder.py index d14b6cd4946..334808da8c3 100644 --- a/nncf/openvino/graph/nncf_graph_builder.py +++ b/nncf/openvino/graph/nncf_graph_builder.py @@ -44,6 +44,8 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype: type_name = ov_type.get_type_name() conversion_map = { "nf4": "float", + "f4e2m1": "float", + "f8e8m0": "float", "f8e4m3": "float", "f8e5m2": "float", "f16": "float", diff --git a/nncf/openvino/optimized_functions/functions.py b/nncf/openvino/optimized_functions/functions.py index 2a11e4c3608..b8842cf3585 100644 --- a/nncf/openvino/optimized_functions/functions.py +++ b/nncf/openvino/optimized_functions/functions.py @@ -107,17 +107,15 @@ def do_float_quantization( precomputed_scale: Optional[Tensor] = None, ) -> tuple[Tensor, Tensor]: """ - Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. - For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved + Computes quantization scale if not provided, and performs corresponding float weight quantization. :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized weight tensor and corresponding scale tensor. """ - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape @@ -129,7 +127,8 @@ def do_float_quantization( if weight.backend == TensorBackend.ov: # Return ov tensors in target precision to seamlessly insert them into openvino model later ov_model_params.return_ov_tensors = True - ov_model_params.output_dtypes.update({"compressed_weight": TensorDataType.nf4}) + weight_dtype = TensorDataType.f4e2m1 if config.mode == CompressWeightsMode.E2M1 else TensorDataType.nf4 + ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype}) model = get_float_quantization_model( ov_model_params, @@ -235,7 +234,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: diff --git a/nncf/openvino/optimized_functions/models.py b/nncf/openvino/optimized_functions/models.py index 4002509697e..ae8ac504d20 100644 --- a/nncf/openvino/optimized_functions/models.py +++ b/nncf/openvino/optimized_functions/models.py @@ -244,7 +244,8 @@ def get_float_quantization_model( reduction_axes: Optional[ReductionAxes] = None, ) -> Union[ModelCallable, ModelAsNodes]: """ - Get a model that compresses weights to float (currently only nf4) destination type using the given configuration. + Get a model that compresses weights to float (currently nf4 or f4e2m1) destination type using the given + configuration. :param ov_model_params: OV model parameters. :param config: Compression configuration. @@ -533,7 +534,7 @@ def _build_float_quantization_model( reduction_axes: Optional[ReductionAxes] = None, return_nodes: bool = False, ) -> Union[ModelCallable, ModelAsNodes]: - assert config.mode == CompressWeightsMode.NF4 + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] default_input_dtypes = {"scale": TensorDataType.float32} default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32} @@ -562,8 +563,7 @@ def _build_float_quantization_model( raise ValueError(msg) # Validate output dtypes - # TODO: add support for f4e2m1 once ticket 164851 is resolved - valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4] + valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1] if compressed_weight_dtype not in valid_compressed_weight_dtypes: msg = ( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. 
" @@ -591,8 +591,16 @@ def _build_float_quantization_model( eps = np.finfo(np.float32).eps scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale) + if config.mode == CompressWeightsMode.E2M1: + scale = scale / opset.constant(6.0, ov.Type.f32) + scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32)) + scale = opset.ceil(scale) + scale = opset.clamp(scale, -127.0, 127.0) + scale = opset.power(opset.constant(2.0, ov.Type.f32), scale) + compressed_weight = divide_op(weight, scale) - compressed_weight = convert_op(compressed_weight, ov.Type.nf4) + target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1 + compressed_weight = convert_op(compressed_weight, target_dtype) compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 12522e0fa60..8a3aff73c16 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -71,6 +71,29 @@ dtype=np.float32, ) +E2M1_QUANTILES = np.array( + [ + -6.0, + -4.0, + -3.0, + -2.0, + -1.5, + -1.0, + -0.5, + -0.0, + 0.5, + 1.0, + 1.5, + 2.0, + 3.0, + 4.0, + 6.0, + ], + dtype=np.float32, +) + +CENTER_OF_E2M1_QUANTILES = (E2M1_QUANTILES[1:] + E2M1_QUANTILES[:-1]) / 2 + MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 @@ -124,7 +147,7 @@ def reshape_weight_for_grouped_quantization( def calculate_float_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, max_val=6.0 + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ Calculates the scale for nf4 or e2m1 quantization. @@ -132,7 +155,6 @@ def calculate_float_quantization_params( :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). :param config: Weight compression configuration. - :param max_val: Maximal value of e2m1 type. :return: Scale tensor of float32 type for float quantization. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] @@ -147,7 +169,7 @@ def calculate_float_quantization_params( scale = fns.where(fns.abs(scale) < eps, eps, scale) if config.mode == CompressWeightsMode.E2M1: - scale = scale / max_val + scale = scale / 6.0 scale = fns.log2(scale) scale = fns.ceil(scale) scale = fns.clip(scale, -127, 127) @@ -181,14 +203,13 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For E2M1 currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved + For E2M1 quantization quantizes the weights to 16 levels on [-6, 6] interval. :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized weight tensor and corresponding scale tensor. 
""" assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] @@ -197,7 +218,7 @@ def do_float_quantization( weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) # Optimized implementation - if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): + if _can_run_optimized(weight): from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale) @@ -212,15 +233,13 @@ def do_float_quantization( if scale is None: scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) - if config.mode == CompressWeightsMode.NF4: - if original_weight_backend == TensorBackend.ov: - # Can convert through OpenVINO and return OpenVINO-native NF4 tensor - compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4) - else: - compressed_weight = _calculate_nf4_quantized_weight(norm_weight) + if original_weight_backend == TensorBackend.ov: + # Can convert through OpenVINO and return OpenVINO-native tensor + target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 + compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype) else: - # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved - compressed_weight = norm_weight + compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode) + # compressed_weight = norm_weight return compressed_weight, scale @@ -232,8 +251,7 @@ def float_quantize_dequantize_weight( return_compressed_weight: Optional[bool] = False, ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ - First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values. - E2M1 mode is currently not supported. + First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. @@ -242,8 +260,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ - assert config.mode == CompressWeightsMode.NF4 - # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved + assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] # Optimized implementation if _can_run_optimized(weight): @@ -523,17 +540,21 @@ def integer_quantize_dequantize_weight( return decompressed_weight -def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor: +def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor: """ - Performs NF4 quantization. Look-up table is used to "round" or "quantize" to the closest quant. + Performs float (currently NF4 or F4E2M1) quantization. Look-up table is used to "round" or "quantize" to the + closest quant. - :param norm_weight: Weight tensor to quantize already normalized to [-1, 1] range. - :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1]. + :param norm_weight: Weight tensor to quantize. 
+ :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants. """ - center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend) - indexes = fns.searchsorted(center_nf4_quantiles, norm_weight) - nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend) - quantized_weight = nf4_quantiles[indexes] + assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else E2M1_QUANTILES + quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_E2M1_QUANTILES + quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend) + indexes = fns.searchsorted(quantile_centers, norm_weight) + quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend) + quantized_weight = quantiles[indexes] return quantized_weight diff --git a/nncf/tensor/definitions.py b/nncf/tensor/definitions.py index 8072ce9170a..e549bb86fec 100644 --- a/nncf/tensor/definitions.py +++ b/nncf/tensor/definitions.py @@ -42,6 +42,8 @@ class TensorDataType(Enum): float64 = auto() f8e4m3 = auto() f8e5m2 = auto() + f8e8m0 = auto() + f4e2m1 = auto() nf4 = auto() int8 = auto() int32 = auto() diff --git a/nncf/tensor/functions/openvino_numeric.py b/nncf/tensor/functions/openvino_numeric.py index 686478e4f5f..3e2f56d5f23 100644 --- a/nncf/tensor/functions/openvino_numeric.py +++ b/nncf/tensor/functions/openvino_numeric.py @@ -23,6 +23,8 @@ DTYPE_MAP: dict[TensorDataType, ov.Type] = { TensorDataType.nf4: ov.Type.nf4, + TensorDataType.f4e2m1: ov.Type.f4e2m1, + TensorDataType.f8e8m0: ov.Type.f8e8m0, TensorDataType.f8e4m3: ov.Type.f8e4m3, TensorDataType.f8e5m2: ov.Type.f8e5m2, TensorDataType.float16: ov.Type.f16, @@ -67,6 +69,8 @@ def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: TensorDataType.int4, TensorDataType.uint4, TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e8m0, TensorDataType.f8e4m3, TensorDataType.f8e5m2, ] @@ -94,13 +98,15 @@ def _(a: ov.Tensor, shape: Union[int, tuple[int, ...]]) -> ov.Tensor: @numeric.as_numpy_tensor.register def _(a: ov.Tensor) -> NDArray[Any]: - # Cannot convert bfloat16, uint4, int4, nf4, f8e4m3, f8e5m2 to numpy directly + # Cannot convert bfloat16, uint4, int4, nf4, f4e2m1, f8e8m0, f8e4m3, f8e5m2 to numpy directly a_dtype = DTYPE_MAP_REV[a.get_element_type()] if a_dtype in [ TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4, TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e8m0, TensorDataType.f8e4m3, TensorDataType.f8e5m2, ]: diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 2e15dc9a0c5..2e5f48d3f42 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -45,7 +45,7 @@ from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION -from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_float_quantized_weight from 
nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error @@ -1520,7 +1520,7 @@ def test_nf4_quantization_mid_quant(weight, scale): scale = Tensor(scale) # norm_weight equals -0.8480964 (one bit away from the first NF4 quantile center) norm_weight = _calculate_normalized_weight(weight, scale) - nf4_quant = _calculate_nf4_quantized_weight(norm_weight) + nf4_quant = _calculate_float_quantized_weight(norm_weight) norm_weight_ov_backend = Tensor(ov.Tensor(norm_weight.data, norm_weight.shape, ov.Type.f32)) ref_nf4_quant = norm_weight_ov_backend.astype(TensorDataType.nf4).as_numpy_tensor() diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index fcbb127c8d3..40419962363 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -70,6 +70,8 @@ class QuantizationTask(Enum): FP4_COMPRESSION_CONFIGS = [ WeightCompressionConfig(CompressWeightsMode.NF4), WeightCompressionConfig(CompressWeightsMode.NF4, group_size=2), + WeightCompressionConfig(CompressWeightsMode.E2M1), + WeightCompressionConfig(CompressWeightsMode.E2M1, group_size=2), ] COMPRESSION_CONFIGS = INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS @@ -149,18 +151,19 @@ def test_optimized_compression_is_disabled(weight_shape, is_disabled, quantizati reason="Due to a bug in CPU plugin compression models can fail at compilation on ARM CPUs. Ticket: 164135.", ) @pytest.mark.parametrize("weight_shape", [WEIGHT_SHAPE], ids=[""]) -@pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) +# @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) +@pytest.mark.parametrize("config", FP4_COMPRESSION_CONFIGS[-2:]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), [ (QuantizationTask.Q, TensorBackend.numpy), - (QuantizationTask.Q, "auto"), + # (QuantizationTask.Q, "auto"), # NumPy backend should support OV tensors as inputs only for quantization task - (QuantizationTask.Q, TensorBackend.ov), - (QuantizationTask.Q_DQ, TensorBackend.numpy), - (QuantizationTask.Q_DQ, "auto"), - (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), - (QuantizationTask.Q_DQ_RQ, "auto"), + # (QuantizationTask.Q, TensorBackend.ov), + # (QuantizationTask.Q_DQ, TensorBackend.numpy), + # (QuantizationTask.Q_DQ, "auto"), + # (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), + # (QuantizationTask.Q_DQ_RQ, "auto"), ], ) @pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @@ -240,6 +243,10 @@ def test_quantization_alignment(weight_shape, config, quantization_task, tensor_ np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) # Save results for comparison between implementations + if group_size != -1 and not precompute_s_zp: + weight, _ = reshape_weight_for_grouped_quantization(weight, REDUCTION_AXES, group_size) + results[cb]["input"] = weight.as_numpy_tensor() + if quantization_task != QuantizationTask.Q: results[cb]["decompressed_weight"] = decompressed_weight if quantization_task != QuantizationTask.Q_DQ: @@ -361,9 +368,9 @@ def 
get_input_node_data(node: ov.Node, input_id: int) -> Tensor: or compression_kwargs.get("lora_correction") ) - if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM]: + if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM, CompressWeightsMode.E2M1]: if is_data_aware: - pytest.skip("Data-aware compression is not supported for INT8 modes.") + pytest.skip("Data-aware compression is not supported for INT8 or F4E2M1 modes.") else: compression_kwargs["all_layers"] = True @@ -455,12 +462,12 @@ def _check_backends_and_dtypes( and config.num_bits == 4 ): # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed - # zero point must be in ov backend and have (u)int4 or nf4 dtypes in order to be able to insert them into OV - # model without re-packing + # zero point must be in ov backend and have (u)int4/nf4/f4e2m1 dtypes in order to be able to insert them into + # OV model without re-packing if config.is_integer: ref_dtype = TensorDataType.uint4 if config.is_asym_mode else TensorDataType.int4 else: - ref_dtype = TensorDataType.nf4 + ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 assert compressed_weight.backend == TensorBackend.ov assert compressed_weight.dtype == ref_dtype if config.is_asym_mode and not precompute_s_zp: @@ -469,14 +476,17 @@ def _check_backends_and_dtypes( else: if quantization_task != QuantizationTask.Q_DQ: # Otherwise, for integer compression, compressed weight and zero point must be returned in numpy backend, - # compressed weight must be of (u)int8, zero point -- in int32; for nf4 compression, the resulting + # compressed weight must be of (u)int8, zero point -- in int32; for nf4/f4e2m1 compression, the resulting # data type and backend depends on the input tensor backend. if config.is_integer: ref_backend = TensorBackend.numpy ref_dtype = TensorDataType.uint8 if config.is_asym_mode else TensorDataType.int8 else: ref_backend = weight_tensor_backend - ref_dtype = TensorDataType.nf4 if weight_tensor_backend == TensorBackend.ov else TensorDataType.float32 + if weight_tensor_backend == TensorBackend.ov: + ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 + else: + ref_dtype = TensorDataType.float32 assert compressed_weight.backend == ref_backend assert compressed_weight.dtype == ref_dtype if config.is_asym_mode and not precompute_s_zp: @@ -488,6 +498,9 @@ def _check_backends_and_dtypes( def _check_values(results): + def format_list_of_floats(lst): + return ", ".join(f"{x:.6f}" for x in lst) + # Check that the computed tensors are equal between implementations keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: @@ -501,6 +514,23 @@ def _check_values(results): # Note: For static-shaped OV models doing asymmetric compression with convertable divisions there maybe # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. - np.testing.assert_allclose( - ov_result.data, numpy_result.data, atol=0, rtol=0, err_msg=f"Results do not align for {key}." 
- ) + try: + np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=0, rtol=0) + except AssertionError: + not_equal_mask = ov_result.data != numpy_result.data + msg = ( + f"Results do not align for {key} with " + f"{not_equal_mask.sum() / ov_result.data.size * 100:.2f} % misalignment ratio.\n" + f"OV result: {format_list_of_floats(ov_result.data[not_equal_mask])}\n" + f"NumPy result: {format_list_of_floats(numpy_result.data[not_equal_mask])}\n" + ) + if "input" in results[ComputationBackend.OV] and "input" in results[ComputationBackend.NumPy]: + numpy_input = results[ComputationBackend.NumPy]['input'].data + ov_input = results[ComputationBackend.OV]['input'].data + np.testing.assert_allclose(numpy_input, ov_input, atol=0, rtol=0) + msg += f"Input values : {format_list_of_floats(numpy_input[not_equal_mask])}\n" + misaligned_groups_mask = np.any(not_equal_mask, axis=-1) + misaligned_groups = numpy_input[misaligned_groups_mask, ...] + misaligned_groups = np.reshape(misaligned_groups, (-1, misaligned_groups.shape[-1])) + msg += f"First 10 misaligned groups: {[it for it in misaligned_groups][:10]}\n" + raise AssertionError(msg) From b2e090c5ba8267c4b81036522fc9d5766687f6bb Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 22 Jul 2025 16:26:23 +0200 Subject: [PATCH 02/21] Add round to nearest logic for numpy case --- .../weight_compression/weight_lowering.py | 14 +++++++++++++- nncf/tensor/functions/__init__.py | 1 + nncf/tensor/functions/numeric.py | 12 ++++++++++++ nncf/tensor/functions/numpy_numeric.py | 5 +++++ nncf/tensor/functions/tf_numeric.py | 6 ++++++ nncf/tensor/functions/torch_numeric.py | 5 +++++ nncf/tensor/tensor.py | 3 +++ .../test_compression_functions.py | 19 +++++++++---------- 8 files changed, 54 insertions(+), 11 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8a3aff73c16..4af25647b79 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -239,7 +239,6 @@ def do_float_quantization( compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype) else: compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode) - # compressed_weight = norm_weight return compressed_weight, scale @@ -554,6 +553,19 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeights quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend) indexes = fns.searchsorted(quantile_centers, norm_weight) quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend) + + if mode == CompressWeightsMode.E2M1: + # Round to the nearest even quantile + shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1) + left = quantiles[indexes] + right = quantiles[shifted_indexes] + dist_left = fns.abs(norm_weight - left) + dist_right = fns.abs(norm_weight - right) + choose_right = fns.logical_or( + dist_right < dist_left, fns.logical_and(dist_left == dist_right, (shifted_indexes + 1) % 2 == 0) + ) + indexes = fns.where(choose_right, shifted_indexes, indexes) + quantized_weight = quantiles[indexes] return quantized_weight diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index ffb05b430d2..468e9f2c3c7 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -36,6 +36,7 @@ from nncf.tensor.functions.numeric import isempty as isempty from 
nncf.tensor.functions.numeric import item as item from nncf.tensor.functions.numeric import log2 as log2 +from nncf.tensor.functions.numeric import logical_and as logical_and from nncf.tensor.functions.numeric import logical_or as logical_or from nncf.tensor.functions.numeric import masked_mean as masked_mean from nncf.tensor.functions.numeric import masked_median as masked_median diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 886ea3e8ab2..417cf9b0387 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -612,6 +612,18 @@ def logical_or(x1: Tensor, x2: Tensor) -> Tensor: """ +@tensor_dispatcher +def logical_and(x1: Tensor, x2: Tensor) -> Tensor: + """ + Computes the element-wise logical AND of the given input tensors. + Zeros are treated as False and nonzeros are treated as True. + + :param x1: The input tensor. + :param x2: The tensor to compute and with. + :return: Result of elementwise and operation between input_ and other tensor. + """ + + @tensor_dispatcher def masked_mean(x: Tensor, mask: Tensor, axis: T_AXIS, keepdims: bool = False) -> Tensor: """ diff --git a/nncf/tensor/functions/numpy_numeric.py b/nncf/tensor/functions/numpy_numeric.py index b6accc34c82..bb59f1f85ec 100644 --- a/nncf/tensor/functions/numpy_numeric.py +++ b/nncf/tensor/functions/numpy_numeric.py @@ -348,6 +348,11 @@ def _(x1: T_NUMPY_ARRAY, x2: T_NUMPY_ARRAY) -> T_NUMPY_ARRAY: return np.logical_or(x1, x2) +@numeric.logical_and.register +def _(x1: T_NUMPY_ARRAY, x2: T_NUMPY_ARRAY) -> T_NUMPY_ARRAY: + return np.logical_and(x1, x2) + + @numeric.masked_mean.register def _( x: T_NUMPY_ARRAY, diff --git a/nncf/tensor/functions/tf_numeric.py b/nncf/tensor/functions/tf_numeric.py index be3348ade5e..04b866f8344 100644 --- a/nncf/tensor/functions/tf_numeric.py +++ b/nncf/tensor/functions/tf_numeric.py @@ -417,6 +417,12 @@ def _(x1: tf.Tensor, x2: tf.Tensor) -> tf.Tensor: return tf.logical_or(x1, x2) +@numeric.logical_and.register +def _(x1: tf.Tensor, x2: tf.Tensor) -> tf.Tensor: + with tf.device(x1.device): + return tf.logical_and(x1, x2) + + @numeric.masked_mean.register def _( x: tf.Tensor, mask: Optional[tf.Tensor], axis: Optional[Union[int, tuple[int, ...]]], keepdims: bool = False diff --git a/nncf/tensor/functions/torch_numeric.py b/nncf/tensor/functions/torch_numeric.py index 41ba6e89135..a2544376b92 100644 --- a/nncf/tensor/functions/torch_numeric.py +++ b/nncf/tensor/functions/torch_numeric.py @@ -366,6 +366,11 @@ def _(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return torch.logical_or(x1, x2) +@numeric.logical_and.register +def _(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + return torch.logical_and(x1, x2) + + @numeric.masked_mean.register def _(x: torch.Tensor, mask: Optional[torch.Tensor], axis: T_AXIS, keepdims: bool = False) -> torch.Tensor: if mask is None: diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index 1972e55ec4b..9c06f69e9f4 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -144,6 +144,9 @@ def __ifloordiv__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: self._data //= unwrap_tensor_data(other) return self + def __mod__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return cast(Tensor, _call_function("_binary_op_nowarn", self, other, operator.mod)) + def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: return Tensor(self.data @ unwrap_tensor_data(other)) diff --git a/tests/openvino/optimized_functions/test_compression_functions.py 
b/tests/openvino/optimized_functions/test_compression_functions.py index 40419962363..9abe4e802e3 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -151,19 +151,18 @@ def test_optimized_compression_is_disabled(weight_shape, is_disabled, quantizati reason="Due to a bug in CPU plugin compression models can fail at compilation on ARM CPUs. Ticket: 164135.", ) @pytest.mark.parametrize("weight_shape", [WEIGHT_SHAPE], ids=[""]) -# @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) -@pytest.mark.parametrize("config", FP4_COMPRESSION_CONFIGS[-2:]) +@pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), [ (QuantizationTask.Q, TensorBackend.numpy), - # (QuantizationTask.Q, "auto"), + (QuantizationTask.Q, "auto"), # NumPy backend should support OV tensors as inputs only for quantization task - # (QuantizationTask.Q, TensorBackend.ov), - # (QuantizationTask.Q_DQ, TensorBackend.numpy), - # (QuantizationTask.Q_DQ, "auto"), - # (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), - # (QuantizationTask.Q_DQ_RQ, "auto"), + (QuantizationTask.Q, TensorBackend.ov), + (QuantizationTask.Q_DQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ, "auto"), + (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ_RQ, "auto"), ], ) @pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @@ -525,8 +524,8 @@ def format_list_of_floats(lst): f"NumPy result: {format_list_of_floats(numpy_result.data[not_equal_mask])}\n" ) if "input" in results[ComputationBackend.OV] and "input" in results[ComputationBackend.NumPy]: - numpy_input = results[ComputationBackend.NumPy]['input'].data - ov_input = results[ComputationBackend.OV]['input'].data + numpy_input = results[ComputationBackend.NumPy]["input"].data + ov_input = results[ComputationBackend.OV]["input"].data np.testing.assert_allclose(numpy_input, ov_input, atol=0, rtol=0) msg += f"Input values : {format_list_of_floats(numpy_input[not_equal_mask])}\n" misaligned_groups_mask = np.any(not_equal_mask, axis=-1) From a3459840041139acea960aa1b5b5c2fadee875d2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Jul 2025 10:39:20 +0200 Subject: [PATCH 03/21] Tweaks --- .../openvino/optimized_functions/functions.py | 2 ++ .../weight_compression/weight_lowering.py | 14 ++++------- src/nncf/tensor/functions/__init__.py | 1 - src/nncf/tensor/functions/numeric.py | 12 --------- src/nncf/tensor/functions/numpy_numeric.py | 5 ---- src/nncf/tensor/functions/tf_numeric.py | 6 ----- src/nncf/tensor/functions/torch_numeric.py | 5 ---- src/nncf/tensor/tensor.py | 6 +++++ src/nncf/version.py | 2 +- .../template_test_nncf_tensor.py | 25 +++++++++++++++++++ .../quantization/test_weights_compression.py | 2 +- 11 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index a7b3e3a0a64..ba548afb2db 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -108,6 +108,8 @@ def do_float_quantization( ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding float weight quantization. 
+ For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. + For E2M1 quantization quantizes the weights to 16 levels on [-6, 6] interval. :param weight: Weight array to compress. :param config: Weight compression configuration. diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 65dd6832557..95693f122ab 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -504,7 +504,7 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeights Performs float (currently NF4 or F4E2M1) quantization. Look-up table is used to "round" or "quantize" to the closest quant. - :param norm_weight: Weight tensor to quantize. + :param norm_weight: Normalized weight tensor to quantize. :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants. """ assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] @@ -515,15 +515,11 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeights quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend) if mode == CompressWeightsMode.E2M1: - # Round to the nearest even quantile + # If in-between two quantiles, round to the nearest even quantile. shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1) - left = quantiles[indexes] - right = quantiles[shifted_indexes] - dist_left = fns.abs(norm_weight - left) - dist_right = fns.abs(norm_weight - right) - choose_right = fns.logical_or( - dist_right < dist_left, fns.logical_and(dist_left == dist_right, (shifted_indexes + 1) % 2 == 0) - ) + dist_left = fns.abs(norm_weight - quantiles[indexes]) + dist_right = fns.abs(norm_weight - quantiles[shifted_indexes]) + choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted_indexes + 1) % 2 == 0)) indexes = fns.where(choose_right, shifted_indexes, indexes) quantized_weight = quantiles[indexes] diff --git a/src/nncf/tensor/functions/__init__.py b/src/nncf/tensor/functions/__init__.py index 468e9f2c3c7..ffb05b430d2 100644 --- a/src/nncf/tensor/functions/__init__.py +++ b/src/nncf/tensor/functions/__init__.py @@ -36,7 +36,6 @@ from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item from nncf.tensor.functions.numeric import log2 as log2 -from nncf.tensor.functions.numeric import logical_and as logical_and from nncf.tensor.functions.numeric import logical_or as logical_or from nncf.tensor.functions.numeric import masked_mean as masked_mean from nncf.tensor.functions.numeric import masked_median as masked_median diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 417cf9b0387..886ea3e8ab2 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -612,18 +612,6 @@ def logical_or(x1: Tensor, x2: Tensor) -> Tensor: """ -@tensor_dispatcher -def logical_and(x1: Tensor, x2: Tensor) -> Tensor: - """ - Computes the element-wise logical AND of the given input tensors. - Zeros are treated as False and nonzeros are treated as True. - - :param x1: The input tensor. - :param x2: The tensor to compute and with. - :return: Result of elementwise and operation between input_ and other tensor. 
- """ - - @tensor_dispatcher def masked_mean(x: Tensor, mask: Tensor, axis: T_AXIS, keepdims: bool = False) -> Tensor: """ diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index bb59f1f85ec..b6accc34c82 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -348,11 +348,6 @@ def _(x1: T_NUMPY_ARRAY, x2: T_NUMPY_ARRAY) -> T_NUMPY_ARRAY: return np.logical_or(x1, x2) -@numeric.logical_and.register -def _(x1: T_NUMPY_ARRAY, x2: T_NUMPY_ARRAY) -> T_NUMPY_ARRAY: - return np.logical_and(x1, x2) - - @numeric.masked_mean.register def _( x: T_NUMPY_ARRAY, diff --git a/src/nncf/tensor/functions/tf_numeric.py b/src/nncf/tensor/functions/tf_numeric.py index 04b866f8344..be3348ade5e 100644 --- a/src/nncf/tensor/functions/tf_numeric.py +++ b/src/nncf/tensor/functions/tf_numeric.py @@ -417,12 +417,6 @@ def _(x1: tf.Tensor, x2: tf.Tensor) -> tf.Tensor: return tf.logical_or(x1, x2) -@numeric.logical_and.register -def _(x1: tf.Tensor, x2: tf.Tensor) -> tf.Tensor: - with tf.device(x1.device): - return tf.logical_and(x1, x2) - - @numeric.masked_mean.register def _( x: tf.Tensor, mask: Optional[tf.Tensor], axis: Optional[Union[int, tuple[int, ...]]], keepdims: bool = False diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index a2544376b92..41ba6e89135 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -366,11 +366,6 @@ def _(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return torch.logical_or(x1, x2) -@numeric.logical_and.register -def _(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return torch.logical_and(x1, x2) - - @numeric.masked_mean.register def _(x: torch.Tensor, mask: Optional[torch.Tensor], axis: T_AXIS, keepdims: bool = False) -> torch.Tensor: if mask is None: diff --git a/src/nncf/tensor/tensor.py b/src/nncf/tensor/tensor.py index 9c06f69e9f4..c8a05bc2c05 100644 --- a/src/nncf/tensor/tensor.py +++ b/src/nncf/tensor/tensor.py @@ -84,6 +84,12 @@ def __len__(self) -> int: # built-in operations + def __or__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return Tensor(self.data | unwrap_tensor_data(other)) + + def __and__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: + return Tensor(self.data & unwrap_tensor_data(other)) + def __add__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: return Tensor(self.data + unwrap_tensor_data(other)) diff --git a/src/nncf/version.py b/src/nncf/version.py index 43b1de909db..85d53def50f 100644 --- a/src/nncf/version.py +++ b/src/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.18.0" +__version__ = "2.18.0.dev0+66c0366aedirty" BKC_TORCH_SPEC = "==2.7.*" diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index f193be28b97..6015674c4a3 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -48,6 +48,8 @@ } BINARY_OPERATORS = ["add", "sub", "pow", "mul", "truediv", "floordiv"] +BOOLEAN_OPERATOR_MAP = {"and": operator.and_, "or": operator.or_} + COMPARISON_OPERATOR_MAP = { "lt": operator.lt, "le": operator.le, @@ -98,6 +100,25 @@ def test_operator_clone(self): assert id(tensor_a.data) is not id(tensor_b.data) assert all(tensor_a == tensor_b) + @pytest.mark.parametrize("op_name", BOOLEAN_OPERATOR_MAP.keys()) + @pytest.mark.parametrize("value", [True, False]) + def test_operators_bool(self, op_name, value): + tensor_a = self.to_tensor([True, False]) + + nncf_tensor_a = Tensor(tensor_a) + + fn = BOOLEAN_OPERATOR_MAP[op_name] + res = fn(tensor_a, value) + res_nncf = fn(nncf_tensor_a, value) + + assert res.dtype == res_nncf.data.dtype + assert all(res == res_nncf.data) + assert isinstance(res_nncf, Tensor) + if ( + self.backend() != TensorBackend.tf + ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + assert res_nncf.device == nncf_tensor_a.device + @pytest.mark.parametrize("op_name", OPERATOR_MAP.keys()) def test_operators_tensor(self, op_name): tensor_a = self.to_tensor([1.0, 2.0]) @@ -1982,6 +2003,8 @@ def test_fn_zeros(self): TensorDataType.int4, TensorDataType.uint4, TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e8m0, TensorDataType.f8e4m3, TensorDataType.f8e5m2, ] @@ -2014,6 +2037,8 @@ def test_fn_eye(self, n, m, ref): TensorDataType.int4, TensorDataType.uint4, TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e8m0, TensorDataType.f8e4m3, TensorDataType.f8e5m2, ] diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b9d364a0d37..f4f67ae5f95 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1763,7 +1763,7 @@ def test_nf4_quantization_mid_quant(weight, scale): scale = Tensor(scale) # norm_weight equals -0.8480964 (one bit away from the first NF4 quantile center) norm_weight = _calculate_normalized_weight(weight, scale) - nf4_quant = _calculate_float_quantized_weight(norm_weight) + nf4_quant = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.NF4) norm_weight_ov_backend = Tensor(ov.Tensor(norm_weight.data, norm_weight.shape, ov.Type.f32)) ref_nf4_quant = norm_weight_ov_backend.astype(TensorDataType.nf4).as_numpy_tensor() From 6e3ba6ef54b71f81ec68311cfc2c03741fcded1b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Jul 2025 10:39:34 +0200 Subject: [PATCH 04/21] Temporarily install OV nightly --- .github/workflows/call_precommit.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/call_precommit.yml b/.github/workflows/call_precommit.yml index 3931cb5e792..73c5a7b639d 100644 --- a/.github/workflows/call_precommit.yml +++ b/.github/workflows/call_precommit.yml @@ -91,6 +91,8 @@ jobs: shell: bash - name: Install NNCF and test requirements run: pip install . 
-r tests/openvino/requirements.txt + - name: Install OpenVINO nightly + run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope From 7555794b545f012237b2a12323a2556120a4680a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 21 Aug 2025 10:51:30 +0200 Subject: [PATCH 05/21] Update src/nncf/version.py --- src/nncf/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/version.py b/src/nncf/version.py index 85d53def50f..43b1de909db 100644 --- a/src/nncf/version.py +++ b/src/nncf/version.py @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.18.0.dev0+66c0366aedirty" +__version__ = "2.18.0" BKC_TORCH_SPEC = "==2.7.*" From 83770e1fafefa59327be5b087ec056eb81089982 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Mon, 22 Sep 2025 16:53:24 +0200 Subject: [PATCH 06/21] [OpenVINO][WC] E5M2 and E4M3 FP8 weights compression support --- .../openvino/optimized_functions/functions.py | 4 +-- .../openvino/optimized_functions/models.py | 2 +- src/nncf/parameters.py | 7 ++++ .../weight_compression/algorithm.py | 4 +++ .../algorithms/weight_compression/config.py | 2 ++ .../weight_compression/openvino_backend.py | 6 ++++ .../weight_compression/torch_backend.py | 2 ++ .../weight_compression/torch_fx_backend.py | 2 ++ .../weight_compression/weight_lowering.py | 34 +++++++++++++------ src/nncf/quantization/quantize_model.py | 20 ++++++++--- .../quantization/test_weights_compression.py | 29 ++++++++++------ .../quantization/test_weights_compression.py | 7 +++- 12 files changed, 88 insertions(+), 31 deletions(-) diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index e22ea481abd..d2cf4481d93 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -109,13 +109,13 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved + TODO(nikita-savelyevv): add support for E2M1, E5M2 and E4M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for e2m1, e5m2 and e4m3 normalized) weight tensor and corresponding scale tensor. 
""" assert config.mode == CompressWeightsMode.NF4 diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 8400f04b4fe..c94dfc53757 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -597,7 +597,7 @@ def _build_float_quantization_model( ) # Validate output dtypes - # TODO: add support for f4e2m1 once ticket 164851 is resolved + # TODO: add support for f4e2m1, f8e5m2 and f8e4m3 once ticket 164851 is resolved valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4] if compressed_weight_dtype not in valid_compressed_weight_dtypes: msg = ( diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 09265dcf9ca..19ff01a0561 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -90,6 +90,11 @@ class CompressWeightsMode(StrEnum): :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + 2 bist exponent, 1 bit mantissa. + :param E5M2: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + 5 bits expotent, 2 bits mantissa. + :param E4M3: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + 4 bits expotent, 3 bits mantissa. :param CODEBOOK: Codebook (LUT) quantization format. :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. """ @@ -102,6 +107,8 @@ class CompressWeightsMode(StrEnum): CB4_F8E4M3 = "cb4_f8e4m3" INT8 = "int8" # Deprecated mode E2M1 = "e2m1" + E5M2 = "e5m2" + E4M3 = "e4m3" CODEBOOK = "codebook" diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index d0407a1eff4..d64568820e6 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -64,6 +64,8 @@ CompressWeightsMode.INT4_ASYM, CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, ] SUPPORTED_DATA_TYPES = [ TensorDataType.float16, @@ -286,6 +288,8 @@ def __init__( with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. E2M1 is the same as INT4_SYM mode, but primary precision is E2M1 data type without zero point. + E5M2 is the same as INT8_SYM mode, but primary precision is E5M2 data type without zero point. + E5M3 is the same as INT8_SYM mode, but primary precision is E5M3 data type without zero point. :param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to backup_mode). :param group_size: number of weights (e.g. 
128) in the channel dimension diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 0f771d11b3e..3cfeff56ff0 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -60,6 +60,8 @@ def is_integer(self): return self.mode not in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ] diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 01a6413cb2f..0bc00de957b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -228,6 +228,12 @@ def _create_compression_subgraph( elif compression_config.mode == CompressWeightsMode.E2M1: compression_dtype = ov.Type.f4e2m1 scale_dtype = ov.Type.f8e8m0 + elif compression_config.mode == CompressWeightsMode.E5M2: + compression_dtype = ov.Type.f8e5m2 + scale_dtype = ov.Type.f8e8m0 + elif compression_config.mode == CompressWeightsMode.E4M3: + compression_dtype = ov.Type.f8e4m3 + scale_dtype = ov.Type.f8e8m0 elif compression_config.mode == CompressWeightsMode.INT4_SYM: compression_dtype = ov.Type.i4 elif compression_config.mode == CompressWeightsMode.INT4_ASYM: diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index 0d91cdc3d03..f73cd8d86b5 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -436,6 +436,8 @@ def transform_model( if compression_config.mode in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, ]: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index 09260cfc56e..a66cbc13669 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -179,6 +179,8 @@ def transform_model( if compression_config.mode in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, ]: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index a08dd1340a8..8deac87d3cb 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -81,7 +81,7 @@ def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ - Calculates the scale for nf4 or e2m1 quantization. + Calculates the scale for nf4 or e2m1/e5m2/e4m3 quantization. :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). 
@@ -94,15 +94,27 @@ def calculate_float_quantization_params( weight = weight.astype(TensorDataType.float32) scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) - if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]: - max_val = 6.0 if config.mode == CompressWeightsMode.E2M1 else fns.max(fns.abs(config.get_numpy_codebook())) + FP_MAX_VALS = { + CompressWeightsMode.E2M1: 6.0, + CompressWeightsMode.E5M2: 57_344.0, + CompressWeightsMode.E4M3: 448.0, + } + if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()): + if config.mode in FP_MAX_VALS: + max_val = FP_MAX_VALS[config.mode] + else: + max_val = fns.max(fns.abs(config.get_numpy_codebook())) scale = scale / max_val # NOTE: adding machine epsilon to avoid division by zero eps = fns.finfo(weight).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) - if config.mode == CompressWeightsMode.E2M1: + if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.E5M2, CompressWeightsMode.E4M3]: + # FP types are using E8M0 type scale. + # It can only contain values [2**(-127), 2**(-126), ..., 2**(126), 2**(127)]. + # Here, we quantize each element of the scale to the smallest possible value greater than or equal to + # the element value to make it possible to convert the float scale value to a FP format without rounding. scale = fns.log2(scale) scale = fns.ceil(scale) scale = fns.clip(scale, -127, 127) @@ -134,16 +146,16 @@ def do_float_quantization( precomputed_scale: Optional[Tensor] = None, ) -> tuple[Tensor, Tensor, Tensor]: """ - Computes quantization scale if not provided, and performs corresponding (nf4, e2m1) weight quantization. + Computes quantization scale if not provided, and performs corresponding (nf4, e2m1, e5m2, e4m3) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For E2M1 and CODEBOOK currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved + For E2M1, E5M2, E4M3 and CODEBOOK currently returns normalized weight without quantization. + TODO(nikita-savelyevv): add support for E2M1, E5M2, E4M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1 normalized) weight tensor and corresponding scale tensor and + :return: Returns quantized (for e2m1, e5m2, e4m3 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. """ assert not config.is_integer @@ -180,7 +192,7 @@ def do_float_quantization( ) return compressed_weight, scale, indexes else: - # TODO(nikita-savelyevv): add support for E2M1 once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for E2M1, E5M2, E4M3 once ticket 164851 is resolved compressed_weight = norm_weight return compressed_weight, scale, None @@ -194,7 +206,7 @@ def float_quantize_dequantize_weight( ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values. - E2M1 mode is currently not supported. + E2M1, E5M2, E4M3 mode is currently not supported. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. 
@@ -204,7 +216,7 @@ def float_quantize_dequantize_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] - # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for f4e2m1, e5m2, e4m3, once ticket 164851 is resolved # Optimized implementation if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 320a98253af..9ebd67e3ed3 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -519,10 +519,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -568,10 +570,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { @@ -607,8 +611,12 @@ def compress_weights( msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." raise nncf.ParameterNotSupportedError(msg) - if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: - msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." + if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ + CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, + ]: + msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [E2M1, E5M2, E4M3]." raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: @@ -627,10 +635,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "ONNX backend does not support NF4, E2M1 and CODEBOOK modes for weight compression." + msg = "ONNX backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." 
raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 55d3d929de2..e022195beff 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -799,9 +799,10 @@ def test_raise_error_with_unsupported_params_for_int4(mode, params): "gptq", ), ) -def test_raise_error_with_unsupported_params_for_e2m1(algo): +@pytest.mark.parametrize("mode", [CompressWeightsMode.E2M1, CompressWeightsMode.E5M2, CompressWeightsMode.E4M3]) +def test_raise_error_with_unsupported_params_for_fp(algo, mode): with pytest.raises(nncf.ParameterNotSupportedError): - compress_weights(ov.Model([], []), dataset="anything", mode=CompressWeightsMode.E2M1, **{algo: True}) + compress_weights(ov.Model([], []), dataset="anything", mode=mode, **{algo: True}) @pytest.mark.parametrize("mode", INT4_NF4_MODES) @@ -1098,7 +1099,7 @@ def test_call_gptq_with_dataset_scale_estimation_neg_group_size(mode): @pytest.mark.parametrize( - ("mode", "all_layers", "ratio", "ref_ids"), + ("sensitivity_metric", "all_layers", "ratio", "ref_ids"), ( (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), @@ -1118,23 +1119,29 @@ def test_call_gptq_with_dataset_scale_estimation_neg_group_size(mode): (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), ), ) -def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): +@pytest.mark.parametrize( + "mode, ov_type", + [ + (CompressWeightsMode.E5M2, ov.Type.f8e5m2), + (CompressWeightsMode.E4M3, ov.Type.f8e4m3), + (CompressWeightsMode.E2M1, ov.Type.f4e2m1), + ], +) +def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type): model = SequentialMatmulModel().ov_model dataset = Dataset([np.ones([1, 4, 4]), np.arange(16).reshape(1, 4, 4)]) compressed_model = compress_weights( model, - mode=CompressWeightsMode.E2M1, + mode=mode, ratio=ratio, group_size=1, all_layers=all_layers, - sensitivity_metric=mode, + sensitivity_metric=sensitivity_metric, dataset=dataset, ) - names_e2m1 = { - op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.f4e2m1 - } - ref_e2m1_nodes = {f"weights_{i}" for i in ref_ids} - assert ref_e2m1_nodes == names_e2m1 + names_fp = {op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov_type} + ref_fp_nodes = {f"weights_{i}" for i in ref_ids} + assert ref_fp_nodes == names_fp names_e8m0 = { op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov.Type.f8e8m0 diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index f799bc995fa..9f3527106bf 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -50,7 +50,12 @@ INT8_MODES = (CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) INT4_MODES = (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM) SUPPORTED_MODES = INT8_MODES + INT4_MODES -UNSUPPORTED_MODES = (CompressWeightsMode.NF4, CompressWeightsMode.E2M1) +UNSUPPORTED_MODES = ( + CompressWeightsMode.NF4, + CompressWeightsMode.E2M1, + CompressWeightsMode.E5M2, + CompressWeightsMode.E4M3, +) 
class SequentialMatmulModel(nn.Module): From 8054217df82ef65ec9670468a264b6bed548c7dd Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Tue, 30 Sep 2025 16:02:27 +0200 Subject: [PATCH 07/21] MXFP4/MXFP8_E4M3 --- .ci/cspell_dict.txt | 1 + docs/Algorithms.md | 3 +- .../weights_compression/Usage.md | 14 +-- .../openvino/optimized_functions/functions.py | 4 +- .../openvino/optimized_functions/models.py | 2 +- src/nncf/parameters.py | 17 ++-- .../weight_compression/algorithm.py | 27 +++--- .../algorithms/weight_compression/config.py | 5 +- .../weight_compression/openvino_backend.py | 7 +- .../weight_compression/torch_backend.py | 5 +- .../weight_compression/torch_fx_backend.py | 5 +- .../weight_compression/weight_lowering.py | 24 ++--- src/nncf/quantization/quantize_model.py | 33 ++++--- tests/openvino/native/models.py | 6 +- .../quantization/test_weights_compression.py | 93 +++++++++++-------- .../quantization/test_weights_compression.py | 5 +- 16 files changed, 131 insertions(+), 120 deletions(-) diff --git a/.ci/cspell_dict.txt b/.ci/cspell_dict.txt index 44977d86464..e475b6f1951 100644 --- a/.ci/cspell_dict.txt +++ b/.ci/cspell_dict.txt @@ -271,6 +271,7 @@ multidevice multiforward multihead multiobjective +mxfp namedtuples nanmean nanquantile diff --git a/docs/Algorithms.md b/docs/Algorithms.md index d393f0c543d..3a8d876c646 100644 --- a/docs/Algorithms.md +++ b/docs/Algorithms.md @@ -11,7 +11,8 @@ - Symmetric 8 bit compression mode - Symmetric and asymmetric 4 bit compression mode - NF4 compression mode - - E2M1 weights with E8M0 scales compression mode + - MXFP4 compression model with E2M1 data type, E8M0 scales and group size == 32 + - MXFP8_E4M3 compression model with E4M3 data type, E8M0 scales and group size == 32 - Mixed precision weights compression - Grouped weights compression diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index bd16fe06ef6..9ba3b6175dc 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -22,8 +22,8 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod ### Supported modes By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode. -OpenVINO backend also supports 4 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM, NF4, E2M1. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. In case of E2M1 mode - [e2m1](https://arxiv.org/pdf/2310.10537) data type without zero point and has 8bit [E8M0](https://arxiv.org/pdf/2310.10537) scale. -All 4-bit modes have a grouped quantization support, when small group of weights (e.g. 128) in the channel dimension share quantization parameters (scale). 
+OpenVINO backend also supports 5 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM, NF4, MXFP4 and MXFP8_E4M3. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. In case of MXFP4 mode - [e2m1](https://arxiv.org/pdf/2310.10537) data type without zero point and has 8bit [E8M0](https://arxiv.org/pdf/2310.10537) scale. In case of MXFP8_E4M3 mode - [e4m3](https://arxiv.org/pdf/2310.10537) data type without zero point and has 8bit [E8M0](https://arxiv.org/pdf/2310.10537) scale. +All 4-bit modes except MXFP4/MXFP8_E4M3 have a grouped quantization support, when small group of weights (e.g. 128) in the channel dimension share quantization parameters (scale). MXFP4 and MXFP8_E4M3 could be used only with group size == 32. All embeddings, convolutions and last linear layers are always compressed to a backup mode, which is "INT8_ASYM", by default. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`. Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to a backup mode. OpenVINO backend supports 3 backup modes: INT8_SYM, INT8_ASYM, and NONE, which retains the original floating-point precision of the model weights. Backup mode is supported only for mixed-precision weight quantization. @@ -191,13 +191,13 @@ from nncf import compress_weights, CompressWeightsMode compressed_model = compress_weights(model, mode=CompressWeightsMode.NF4) ``` -- `E2M1` mode can be considered for improving accuracy, but currently models quantized to e2m1 should not be faster models - quantized to 8-bit asymmetric integer. Here's the example how to compress weights to e2m1 data type with group size = 32 (recommended). - Different `group_size` and `ratio` are also supported. +- `MXFP4` or `MXFP8_E4M3` modes can be considered for improving accuracy, but currently models quantized to mxfp4/mxfp8_e4m3 should not be faster models + quantized to 8-bit asymmetric integer. Here's the example how to compress weights to mxfp4 data type with constant group size == 32. + Different `ratio` is also supported. ```python from nncf import compress_weights, CompressWeightsMode -compressed_model = compress_weights(model, mode=CompressWeightsMode.E2M1, group_size=32, all_layers=True) +compressed_model = compress_weights(model, mode=CompressWeightsMode.MXFP4, group_size=32, all_layers=True) ``` #### Caching Statistics @@ -672,7 +672,7 @@ Accuracy/footprint trade-off for `microsoft/Phi-3-mini-4k-instruct`: - The compression applies in-place. - The compressed model is not trainable. - INT4_SYM, INT4_ASYM, NF4 and E2M1 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only. -- NF4, E2M1 support is experimental on GPU and NPU - models quantized to nf4/e2m1 should not be faster models quantized to 8-bit integer. 
+- NF4, MXFP4, MXFP8_E4M3 support is experimental on GPU and NPU - models quantized to nf4/mxfp4/mxfp8_e4m3 should not be faster models quantized to 8-bit integer. ### Additional resources diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index d2cf4481d93..40b65fe07ee 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -109,13 +109,13 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - TODO(nikita-savelyevv): add support for E2M1, E5M2 and E4M3 once ticket 164851 is resolved + TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1, e5m2 and e4m3 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for MXFP4 and MXFP8_E3M3 normalized) weight tensor and corresponding scale tensor. """ assert config.mode == CompressWeightsMode.NF4 diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index c94dfc53757..97420969001 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -597,7 +597,7 @@ def _build_float_quantization_model( ) # Validate output dtypes - # TODO: add support for f4e2m1, f8e5m2 and f8e4m3 once ticket 164851 is resolved + # TODO: add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4] if compressed_weight_dtype not in valid_compressed_weight_dtypes: msg = ( diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 19ff01a0561..383e1b40404 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -89,12 +89,12 @@ class CompressWeightsMode(StrEnum): https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. - :param E2M1: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 2 bist exponent, 1 bit mantissa. - :param E5M2: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 5 bits expotent, 2 bits mantissa. - :param E4M3: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 4 bits expotent, 3 bits mantissa. + :param MXFP4: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + 2 bist exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. + Group size is constant == 32. + :param MXFP8_E4M3: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. + 4 bist exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. + Group size is constant == 32. :param CODEBOOK: Codebook (LUT) quantization format. :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. 
""" @@ -106,9 +106,8 @@ class CompressWeightsMode(StrEnum): NF4 = "nf4" CB4_F8E4M3 = "cb4_f8e4m3" INT8 = "int8" # Deprecated mode - E2M1 = "e2m1" - E5M2 = "e5m2" - E4M3 = "e4m3" + MXFP4 = "mxfp4" + MXFP8_E4M3 = "mxfp8_e4m3" CODEBOOK = "codebook" diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index d64568820e6..f8585c1fb53 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -63,9 +63,8 @@ CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM, CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, ] SUPPORTED_DATA_TYPES = [ TensorDataType.float16, @@ -95,13 +94,13 @@ def get_weight_compression_configuration( """ Generates a configuration dictionary for weight compression based on the provided parameters. """ - group_size = ( - -1 - if group_size is None and mode in INT8_MODES - else 128 - if group_size is None and mode in NON_INT8_MODES - else group_size - ) + if group_size is None and mode in INT8_MODES: + group_size = -1 + elif group_size is None and mode in NON_INT8_MODES: + if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: + group_size = 32 + else: + group_size = 128 return { "mode": mode, @@ -246,6 +245,9 @@ def check_user_compression_configuration( f"Supported modes are: {[e.value for e in GroupSizeFallbackMode]}." ) raise nncf.ValidationError(msg) + if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] and group_size not in [None, 32]: + msg = f"MXFP4 and MXFP8_E4M3 types only support group size == 32, group size == {group_size} is given" + raise nncf.ValidationError(msg) class WeightCompression(Algorithm): @@ -287,9 +289,8 @@ def __init__( INT4_ASYM is the same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. - E2M1 is the same as INT4_SYM mode, but primary precision is E2M1 data type without zero point. - E5M2 is the same as INT8_SYM mode, but primary precision is E5M2 data type without zero point. - E5M3 is the same as INT8_SYM mode, but primary precision is E5M3 data type without zero point. + MXFP4 has E2M1 weight dtype with E8M0 scale, group size 32 and no zero point. + MXFP8_E4M3 has E4M3 weight dtype with E8M0 scale, group size 32 and no zero point. :param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to backup_mode). :param group_size: number of weights (e.g. 
128) in the channel dimension diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 3cfeff56ff0..b7c5f032f03 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -59,9 +59,8 @@ def is_integer(self): """ return self.mode not in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ] diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0bc00de957b..9571c1d726b 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -225,13 +225,10 @@ def _create_compression_subgraph( scale_dtype = ov.Type.f16 if compression_config.mode == CompressWeightsMode.NF4: compression_dtype = ov.Type.nf4 - elif compression_config.mode == CompressWeightsMode.E2M1: + elif compression_config.mode == CompressWeightsMode.MXFP4: compression_dtype = ov.Type.f4e2m1 scale_dtype = ov.Type.f8e8m0 - elif compression_config.mode == CompressWeightsMode.E5M2: - compression_dtype = ov.Type.f8e5m2 - scale_dtype = ov.Type.f8e8m0 - elif compression_config.mode == CompressWeightsMode.E4M3: + elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3: compression_dtype = ov.Type.f8e4m3 scale_dtype = ov.Type.f8e8m0 elif compression_config.mode == CompressWeightsMode.INT4_SYM: diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py index f73cd8d86b5..699c4c6f098 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -435,9 +435,8 @@ def transform_model( compression_config = wc_params.compression_config if compression_config.mode in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, ]: msg = f"{compression_config.mode.value} is not supported." raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py index a66cbc13669..215e625b95f 100644 --- a/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/torch_fx_backend.py @@ -178,9 +178,8 @@ def transform_model( compression_config = wc_params.compression_config if compression_config.mode in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, ]: msg = f"{compression_config.mode.value} is not supported." 
raise nncf.ParameterNotSupportedError(msg) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8deac87d3cb..8d2eb5729a8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -81,7 +81,7 @@ def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ - Calculates the scale for nf4 or e2m1/e5m2/e4m3 quantization. + Calculates the scale for nf4 or mxfp4/mxfp8_e3m3 quantization. :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). @@ -95,9 +95,8 @@ def calculate_float_quantization_params( scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) FP_MAX_VALS = { - CompressWeightsMode.E2M1: 6.0, - CompressWeightsMode.E5M2: 57_344.0, - CompressWeightsMode.E4M3: 448.0, + CompressWeightsMode.MXFP4: 6.0, + CompressWeightsMode.MXFP8_E4M3: 448.0, } if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()): if config.mode in FP_MAX_VALS: @@ -110,7 +109,7 @@ def calculate_float_quantization_params( eps = fns.finfo(weight).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) - if config.mode in [CompressWeightsMode.E2M1, CompressWeightsMode.E5M2, CompressWeightsMode.E4M3]: + if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: # FP types are using E8M0 type scale. # It can only contain values [2**(-127), 2**(-126), ..., 2**(126), 2**(127)]. # Here, we quantize each element of the scale to the smallest possible value greater than or equal to @@ -146,16 +145,17 @@ def do_float_quantization( precomputed_scale: Optional[Tensor] = None, ) -> tuple[Tensor, Tensor, Tensor]: """ - Computes quantization scale if not provided, and performs corresponding (nf4, e2m1, e5m2, e4m3) weight quantization. + Computes quantization scale if not provided, + and performs corresponding (nf4, MXFP4 and MXFP8_E3M3) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For E2M1, E5M2, E4M3 and CODEBOOK currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for E2M1, E5M2, E4M3 once ticket 164851 is resolved + For MXFP4, MXFP8_E3M3 and CODEBOOK currently returns normalized weight without quantization. + TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for e2m1, e5m2, e4m3 normalized) weight tensor and corresponding scale tensor and + :return: Returns quantized (for MXFP4 and MXFP8_E3M3 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. 
""" assert not config.is_integer @@ -192,7 +192,7 @@ def do_float_quantization( ) return compressed_weight, scale, indexes else: - # TODO(nikita-savelyevv): add support for E2M1, E5M2, E4M3 once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved compressed_weight = norm_weight return compressed_weight, scale, None @@ -206,7 +206,7 @@ def float_quantize_dequantize_weight( ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values. - E2M1, E5M2, E4M3 mode is currently not supported. + MXFP4 and MXFP8_E3M3 mode is currently not supported. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. @@ -216,7 +216,7 @@ def float_quantize_dequantize_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] - # TODO(nikita-savelyevv): add support for f4e2m1, e5m2, e4m3, once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3, once ticket 164851 is resolved # Optimized implementation if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 9ebd67e3ed3..be01081256a 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -458,7 +458,8 @@ def compress_weights( INT4_ASYM is the same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. - E2M1 is the same as INT4_SYM mode, but primary precision is E2M1 data type without zero point. + MXFP4 has E2M1 data dtype with E8M0 scale, group size 32 and no zero point. + MXFP8_E4M3 has E4M3 data dtype with E8M0 scale, group size 32 and no zero point. :type mode: nncf.CompressWeightsMode :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to INT8_ASYM). @@ -518,13 +519,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -569,13 +569,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." 
raise nncf.ParameterNotSupportedError(msg) options = { @@ -612,11 +611,12 @@ def compress_weights( raise nncf.ParameterNotSupportedError(msg) if any((awq, scale_estimation, gptq, lora_correction)) and mode in [ - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, ]: - msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [E2M1, E5M2, E4M3]." + msg = ( + "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E3M3]." + ) raise nncf.ParameterNotSupportedError(msg) if gptq and lora_correction: @@ -634,13 +634,12 @@ def compress_weights( if mode in [ CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "ONNX backend does not support NF4, E2M1, E5M2, E4M3 and CODEBOOK modes for weight compression." + msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 2ba8e2f8586..765b1a672b9 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -793,13 +793,13 @@ class SequentialMatmulModel(OVReferenceModel): rel_error= 0.03 """ - def _create_ov_model(self): - input_node = opset.parameter([1, 4, 4], name="Input_1") + def _create_ov_model(self, mm_hidden_dim=4): + input_node = opset.parameter([1, 4, mm_hidden_dim], name="Input_1") main_values = [10000, 1000, 1, 10, 10000] last_node = input_node for i, main_value in enumerate(main_values): - weights_data = np.arange(0, 16).reshape(4, 4) + weights_data = np.arange(0, mm_hidden_dim**2).reshape(mm_hidden_dim, mm_hidden_dim) weights_data[-1, -1] = main_value current_weights = opset.constant(weights_data, dtype=np.float32, name=f"weights_{i}") current_node = opset.matmul( diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e022195beff..6d1eb3e1762 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -790,19 +790,24 @@ def test_raise_error_with_unsupported_params_for_int4(mode, params): compress_weights(ov.Model([], []), mode=mode, **params) -@pytest.mark.parametrize( - "algo", - ( - "lora_correction", - "awq", - "scale_estimation", - "gptq", - ), -) -@pytest.mark.parametrize("mode", [CompressWeightsMode.E2M1, CompressWeightsMode.E5M2, CompressWeightsMode.E4M3]) -def test_raise_error_with_unsupported_params_for_fp(algo, mode): - with pytest.raises(nncf.ParameterNotSupportedError): - compress_weights(ov.Model([], []), dataset="anything", mode=mode, **{algo: True}) +@pytest.mark.parametrize("mode", [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]) +class TestUnsupportedParamsMXFP: + @pytest.mark.parametrize( + "algo", + ( + "lora_correction", + "awq", + "scale_estimation", + "gptq", + ), + ) + def test_raise_error_with_unsupported_algo_for_mx(self, algo, mode): + with pytest.raises(nncf.ParameterNotSupportedError): + compress_weights(ov.Model([], []), dataset="anything", mode=mode, **{algo: True}) + + def test_raise_error_with_unsupported_group_size_for_fp(self, mode): + with 
pytest.raises(nncf.ValidationError): + compress_weights(ov.Model([], []), dataset="anything", mode=mode, group_size=64) @pytest.mark.parametrize("mode", INT4_NF4_MODES) @@ -1099,47 +1104,59 @@ def test_call_gptq_with_dataset_scale_estimation_neg_group_size(mode): @pytest.mark.parametrize( - ("sensitivity_metric", "all_layers", "ratio", "ref_ids"), + ("sensitivity_metric", "all_layers", "ratio", "ref_ids", "group_size"), ( - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 3, 4]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, []), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 3]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [0]), - (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, []), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2]), - (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2]), - (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2]), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 1, [0, 1, 2, 3, 4], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.4, [1], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, True, 0.2, [], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 1, [0, 1, 2, 3], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.4, [1], None), + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR, False, 0.2, [], None), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.HESSIAN_INPUT_ACTIVATION, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MAX_ACTIVATION_VARIANCE, False, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, True, 0.8, [0, 1, 2], None), + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2], None), + # One test to check manual group size setup is working as expected + (SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, False, 0.8, [0, 1, 2], 32), ), ) @pytest.mark.parametrize( "mode, ov_type", [ - (CompressWeightsMode.E5M2, ov.Type.f8e5m2), - (CompressWeightsMode.E4M3, ov.Type.f8e4m3), - (CompressWeightsMode.E2M1, ov.Type.f4e2m1), + (CompressWeightsMode.MXFP8_E4M3, ov.Type.f8e4m3), + (CompressWeightsMode.MXFP4, ov.Type.f4e2m1), ], ) -def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type): - model = SequentialMatmulModel().ov_model - dataset = Dataset([np.ones([1, 4, 4]), np.arange(16).reshape(1, 4, 4)]) +def test_mixed_precision_fp(sensitivity_metric, all_layers, ratio, ref_ids, mode, ov_type, group_size): + # Use hidden dim % 32 == 0 to make it possible 
to quantize in MX format + model = SequentialMatmulModel(mm_hidden_dim=32).ov_model + dataset = Dataset([np.ones([1, 4, 32]), np.arange(128).reshape(1, 4, 32)]) + kwargs = {} + if group_size is not None: + kwargs["group_size"] = group_size compressed_model = compress_weights( model, mode=mode, ratio=ratio, - group_size=1, all_layers=all_layers, sensitivity_metric=sensitivity_metric, dataset=dataset, + **kwargs, ) - names_fp = {op.get_friendly_name() for op in compressed_model.get_ordered_ops() if op.get_element_type() == ov_type} + ops = [] + for op in compressed_model.get_ordered_ops(): + if op.get_element_type() == ov_type: + # Check effective default group size == 32 + assert tuple(op.shape) == (32, 1, 32) + ops.append(op) + + names_fp = {op.get_friendly_name() for op in ops} ref_fp_nodes = {f"weights_{i}" for i in ref_ids} assert ref_fp_nodes == names_fp diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 9f3527106bf..7bf959e89e1 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -52,9 +52,8 @@ SUPPORTED_MODES = INT8_MODES + INT4_MODES UNSUPPORTED_MODES = ( CompressWeightsMode.NF4, - CompressWeightsMode.E2M1, - CompressWeightsMode.E5M2, - CompressWeightsMode.E4M3, + CompressWeightsMode.MXFP4, + CompressWeightsMode.MXFP8_E4M3, ) From 3d944de64e8cc377748919528631d9555657e0c8 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Wed, 1 Oct 2025 14:52:44 +0200 Subject: [PATCH 08/21] Expand wc docs with a table --- .../weights_compression/Usage.md | 28 ++++++++++++++++--- src/nncf/parameters.py | 4 +-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index 9ba3b6175dc..c2db88d78ee 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -21,9 +21,29 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod ### Supported modes -By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode. -OpenVINO backend also supports 5 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM, NF4, MXFP4 and MXFP8_E4M3. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. In case of MXFP4 mode - [e2m1](https://arxiv.org/pdf/2310.10537) data type without zero point and has 8bit [E8M0](https://arxiv.org/pdf/2310.10537) scale. In case of MXFP8_E4M3 mode - [e4m3](https://arxiv.org/pdf/2310.10537) data type without zero point and has 8bit [E8M0](https://arxiv.org/pdf/2310.10537) scale. -All 4-bit modes except MXFP4/MXFP8_E4M3 have a grouped quantization support, when small group of weights (e.g. 
128) in the channel dimension share quantization parameters (scale). MXFP4 and MXFP8_E4M3 could be used only with group size == 32. +#### INT8 compression + +By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode. But "INT8_SYM" mode is available as well. + +| Mode | Element type | Scale type | Block Size | Description | +|-----------|----------------|--------------|--------------|---------------------------------------------------------------------------------------------------------------------------------| +| INT8_SYM | INT8 | FP16 | Per-channel | Stands for 8-bit integer [symmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) of all weights.Weights are quantized symmetrically without zero point. | +| INT8_ASYM | INT8 | FP16 | Per-channel | The same as INT8_SYM mode, but weights are quantized to a primary precision [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. | + +#### Mixed precision + +OpenVINO backend also supports 5 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM, NF4, MXFP4 and MXFP8_E4M3. + +| Mode | Element type | Scale type | Block Size | Description | +|------------|----------------------|--------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| INT4_SYM | INT4 | FP16 | Any | Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio. | +| INT4_ASYM | INT4 | FP16 | Any | he same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point. | +| NF4 | NF4 | FP16 | Any | The the same as INT4_SYM mode, but primary precision is [NF4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. | +| MXFP4 | E2M1 | E8M0 | 32 | FP4 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. | +| MXFP8_E4M3 | E4M3 | E8M0 | 32 | FP8 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). 4 bit exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale.| +| CODEBOOK | Any | FP16 | Any | Codebook (lookup table (LUT)) quantization format. | +| CB4_F8E4M3 | FP32 (NF4) -> E4M3 | FP16 | Any | Codebook (lookup table (LUT)) format with 16 fixed fp8 values in E4M3 format. | + All embeddings, convolutions and last linear layers are always compressed to a backup mode, which is "INT8_ASYM", by default. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`. Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to a backup mode. OpenVINO backend supports 3 backup modes: INT8_SYM, INT8_ASYM, and NONE, which retains the original floating-point precision of the model weights. Backup mode is supported only for mixed-precision weight quantization. @@ -197,7 +217,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.NF4) ```python from nncf import compress_weights, CompressWeightsMode -compressed_model = compress_weights(model, mode=CompressWeightsMode.MXFP4, group_size=32, all_layers=True) +compressed_model = compress_weights(model, mode=CompressWeightsMode.MXFP4, all_layers=True) ``` #### Caching Statistics diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index 383e1b40404..a01d8f2b4b1 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -90,10 +90,10 @@ class CompressWeightsMode(StrEnum): :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. :param MXFP4: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 2 bist exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. + 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. Group size is constant == 32. :param MXFP8_E4M3: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 4 bist exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. + 4 bits exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. Group size is constant == 32. :param CODEBOOK: Codebook (LUT) quantization format. :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. 
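
A hedged usage sketch of the MX modes documented above, assuming `model` is an `openvino.Model` whose weight dimensions are divisible by 32; the variable names are placeholders and the snippet is illustrative, not part of the patch series.

```python
import nncf
from nncf import CompressWeightsMode, compress_weights

# A group size other than 32 is rejected by the validation introduced in this series.
try:
    compress_weights(model, mode=CompressWeightsMode.MXFP4, group_size=64)
except nncf.ValidationError as err:
    print(err)  # MXFP4 and MXFP8_E4M3 types only support group size == 32, ...

# With group_size omitted, the MX modes pick up their fixed default of 32.
compressed_model = compress_weights(model, mode=CompressWeightsMode.MXFP4, all_layers=True)
```
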
From 0c48792d3460f6b077d65c7b59d909ce58da5958 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Wed, 1 Oct 2025 14:54:26 +0200 Subject: [PATCH 09/21] Codebook is removed from wc docs --- .../weights_compression/Usage.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index c2db88d78ee..6d932fa17d3 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -39,10 +39,8 @@ OpenVINO backend also supports 5 modes of mixed precision weight quantization wi | INT4_SYM | INT4 | FP16 | Any | Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio. | | INT4_ASYM | INT4 | FP16 | Any | he same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point. | | NF4 | NF4 | FP16 | Any | The the same as INT4_SYM mode, but primary precision is [NF4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. | -| MXFP4 | E2M1 | E8M0 | 32 | FP4 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. | -| MXFP8_E4M3 | E4M3 | E8M0 | 32 | FP8 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). 4 bit exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale.| -| CODEBOOK | Any | FP16 | Any | Codebook (lookup table (LUT)) quantization format. | -| CB4_F8E4M3 | FP32 (NF4) -> E4M3 | FP16 | Any | Codebook (lookup table (LUT)) format with 16 fixed fp8 values in E4M3 format. | +| MXFP4 | E2M1 | E8M0 | 32 | FP4 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. | +| MXFP8_E4M3 | E4M3 | E8M0 | 32 | FP8 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 4 bit exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale.| All embeddings, convolutions and last linear layers are always compressed to a backup mode, which is "INT8_ASYM", by default. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`. Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to a backup mode. OpenVINO backend supports 3 backup modes: INT8_SYM, INT8_ASYM, and NONE, which retains the original floating-point precision of the model weights. Backup mode is supported only for mixed-precision weight quantization. @@ -213,7 +211,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.NF4) - `MXFP4` or `MXFP8_E4M3` modes can be considered for improving accuracy, but currently models quantized to mxfp4/mxfp8_e4m3 should not be faster models quantized to 8-bit asymmetric integer. Here's the example how to compress weights to mxfp4 data type with constant group size == 32. - Different `ratio` is also supported. + Different `ratio` are also supported. 
```python from nncf import compress_weights, CompressWeightsMode From ac2f05c19404ac2d7d8e9c4e889125505834d6ac Mon Sep 17 00:00:00 2001 From: Daniil Lyakhov Date: Wed, 1 Oct 2025 17:51:12 +0200 Subject: [PATCH 10/21] Type --- .../post_training_compression/weights_compression/Usage.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index 6d932fa17d3..a8a0a7758ca 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -36,9 +36,9 @@ OpenVINO backend also supports 5 modes of mixed precision weight quantization wi | Mode | Element type | Scale type | Block Size | Description | |------------|----------------------|--------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| INT4_SYM | INT4 | FP16 | Any | Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio. | -| INT4_ASYM | INT4 | FP16 | Any | he same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point. | -| NF4 | NF4 | FP16 | Any | The the same as INT4_SYM mode, but primary precision is [NF4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. | +| INT4_SYM | INT4 | FP16 | Any | Weights are quantized to a primary precision symmetrically without zero point. All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM, by default. All others are quantized whether to 4-bit integer or to a backup precision depending on criteria and the given ratio. | +| INT4_ASYM | INT4 | FP16 | Any | The same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. | +| NF4 | NF4 | FP16 | Any | The the same as INT4_SYM mode, but primary precision is [NF4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. | | MXFP4 | E2M1 | E8M0 | 32 | FP4 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. | | MXFP8_E4M3 | E4M3 | E8M0 | 32 | FP8 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 4 bit exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale.| From 1e23ecf20a4af036f75121822a34573ef4f4b994 Mon Sep 17 00:00:00 2001 From: Daniil Lyakhov Date: Thu, 2 Oct 2025 17:46:10 +0200 Subject: [PATCH 11/21] Apply suggestions from code review Co-authored-by: Lyalyushkin Nikolay --- docs/Algorithms.md | 4 +- .../weights_compression/Usage.md | 44 +++++++++++-------- .../openvino/optimized_functions/functions.py | 4 +- src/nncf/parameters.py | 8 +--- .../weight_compression/algorithm.py | 6 +-- .../weight_compression/weight_lowering.py | 12 ++--- src/nncf/quantization/quantize_model.py | 12 ++--- 7 files changed, 47 insertions(+), 43 deletions(-) diff --git a/docs/Algorithms.md b/docs/Algorithms.md index 3a8d876c646..116777b0359 100644 --- a/docs/Algorithms.md +++ b/docs/Algorithms.md @@ -11,8 +11,8 @@ - Symmetric 8 bit compression mode - Symmetric and asymmetric 4 bit compression mode - NF4 compression mode - - MXFP4 compression model with E2M1 data type, E8M0 scales and group size == 32 - - MXFP8_E4M3 compression model with E4M3 data type, E8M0 scales and group size == 32 + - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3) + - MX-compliant types - MXFP4 and MXFP8_E4M3 - Mixed precision weights compression - Grouped weights compression diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index a8a0a7758ca..88ee3df7835 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -21,26 +21,36 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod ### Supported modes -#### INT8 compression +#### INT8 modes -By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode. But "INT8_SYM" mode is available as well. +By default, the algorithm applies asymmetric 8-bit integer quantization (INT8_ASYM mode) to all weights. For symmetric quantization without zero point, the INT8_SYM mode is also available. 
Both modes typically preserve model accuracy while providing decent performance improvements. -| Mode | Element type | Scale type | Block Size | Description | -|-----------|----------------|--------------|--------------|---------------------------------------------------------------------------------------------------------------------------------| -| INT8_SYM | INT8 | FP16 | Per-channel | Stands for 8-bit integer [symmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) of all weights.Weights are quantized symmetrically without zero point. | -| INT8_ASYM | INT8 | FP16 | Per-channel | The same as INT8_SYM mode, but weights are quantized to a primary precision [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. | +| Compression Mode | Element type | Scale type | Granularity | Description | +|------------------|--------------|------------|--------------------------|----------------------------| +| INT8_ASYM | INT8 | FP16 | Per-channel | [Asymmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) | +| INT8_SYM | INT8 | FP16 | Per-channel | [Symmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) | -#### Mixed precision +#### Mixed precision modes -OpenVINO backend also supports 5 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM, NF4, MXFP4 and MXFP8_E4M3. +Mixed precision modes offer higher compression rates leading to faster inference, though potentially with greater accuracy loss. These modes utilize two precision types: **primary** and **backup**. The primary precision is determined by the compression mode, while backup precision refers to a higher precision format (default is INT8_ASYM, configurable via the `backup_mode` parameter). -| Mode | Element type | Scale type | Block Size | Description | -|------------|----------------------|--------------|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| INT4_SYM | INT4 | FP16 | Any | Weights are quantized to a primary precision symmetrically without zero point. All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM, by default. All others are quantized whether to 4-bit integer or to a backup precision depending on criteria and the given ratio. | -| INT4_ASYM | INT4 | FP16 | Any | The same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. | -| NF4 | NF4 | FP16 | Any | The the same as INT4_SYM mode, but primary precision is [NF4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. | -| MXFP4 | E2M1 | E8M0 | 32 | FP4 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. 
| -| MXFP8_E4M3 | E4M3 | E8M0 | 32 | FP8 format from [OCP Microscaling Formats (MX) Specification Version 1.0.](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) 4 bit exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale.| +By default, NNCF assigns backup precision to **special** quantization-sensitive layers: embeddings, convolutions, and the last linear layer. To compress these special layers using primary precision instead, set `all_layers=True`. + +NNCF can automatically distribute precision assignments based on quantization sensitivity using the `ratio` parameter. For example, with `ratio=0.9`, layers (excluding special ones) accounting for 90% of model weights receive primary precision, while the remaining layers use backup precision. This distribution minimizes overall quality deterioration by prioritizing less sensitive layers for lower precision. + +| Compression Mode | Element type | Scale type | Granularity | Description | +|------------------|--------------|------------|--------------------------|-------------| +| INT4_SYM | INT4 | FP16 | Per-channel / Group-wise | [Symmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) | +| INT4_ASYM | INT4 | FP16 | Per-channel / Group-wise | [Asymmetric quantization](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) | +| NF4 | FP32 | FP16 | Per-channel / Group-wise | [NormalFloat-4](https://arxiv.org/pdf/2305.14314v1.pdf) lookup table with 16 FP32 values | +| CODEBOOK | Any | FP16 | Per-channel / Group-wise | Arbitrary lookup table (codebook) | +| CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values | +| MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) | +| MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) | + +**Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters. + +**Note**: ExMy is a notation for floating point formats with one sign bit, x exponent bits, and y mantissa bits. For example, E4M3 refers to an FP8 format with one sign bit, four exponent bits, and three mantissa bits. Formats like E8M0 don't include the mantissa bits. All embeddings, convolutions and last linear layers are always compressed to a backup mode, which is "INT8_ASYM", by default. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`. Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to a backup mode. OpenVINO backend supports 3 backup modes: INT8_SYM, INT8_ASYM, and NONE, which retains the original floating-point precision of the model weights. Backup mode is supported only for mixed-precision weight quantization. 
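
A hedged sketch of the mixed-precision knobs summarized in the table above, assuming `model` is an `openvino.Model` and `calibration_items` is an iterable of example inputs; both names are placeholders, not part of the patch.

```python
import nncf
from nncf import CompressWeightsMode, SensitivityMetric, compress_weights

compressed_model = compress_weights(
    model,
    mode=CompressWeightsMode.MXFP8_E4M3,    # primary precision: E4M3 values with an E8M0 group scale
    ratio=0.9,                              # ~90% of the non-special weights get the primary precision
    backup_mode=nncf.BackupMode.INT8_ASYM,  # the rest, plus embeddings and the last layer, stay INT8_ASYM
    sensitivity_metric=SensitivityMetric.MAX_ACTIVATION_VARIANCE,
    dataset=nncf.Dataset(calibration_items),  # data-based sensitivity metrics need calibration data
)
```
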
@@ -209,9 +219,7 @@ from nncf import compress_weights, CompressWeightsMode compressed_model = compress_weights(model, mode=CompressWeightsMode.NF4) ``` -- `MXFP4` or `MXFP8_E4M3` modes can be considered for improving accuracy, but currently models quantized to mxfp4/mxfp8_e4m3 should not be faster models - quantized to 8-bit asymmetric integer. Here's the example how to compress weights to mxfp4 data type with constant group size == 32. - Different `ratio` are also supported. +- Here's the example how to compress weights to MXFP4. Different `ratio` are also supported. ```python from nncf import compress_weights, CompressWeightsMode diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index 40b65fe07ee..622a63c1b7b 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -109,13 +109,13 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding nf4 weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved + TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for MXFP4 and MXFP8_E3M3 normalized) weight tensor and corresponding scale tensor. + :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor. """ assert config.mode == CompressWeightsMode.NF4 diff --git a/src/nncf/parameters.py b/src/nncf/parameters.py index a01d8f2b4b1..9a59950d605 100644 --- a/src/nncf/parameters.py +++ b/src/nncf/parameters.py @@ -89,12 +89,8 @@ class CompressWeightsMode(StrEnum): https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization :param NF4: The the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. :param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead. - :param MXFP4: FP4 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 2 bits exponent, 1 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. - Group size is constant == 32. - :param MXFP8_E4M3: FP8 format from "OCP Microscaling Formats (MX) Specification" Version 1.0. - 4 bits exponent, 3 bit mantissa for the weight and 8 bit exponent, 0 bit mantissa scale. - Group size is constant == 32. + :param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32. + :param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32. :param CODEBOOK: Codebook (LUT) quantization format. :param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format. 
""" diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index f8585c1fb53..c741e163ab7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -246,7 +246,7 @@ def check_user_compression_configuration( ) raise nncf.ValidationError(msg) if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] and group_size not in [None, 32]: - msg = f"MXFP4 and MXFP8_E4M3 types only support group size == 32, group size == {group_size} is given" + msg = f"MXFP4 and MXFP8_E4M3 types only support group size of 32, group size of {group_size} is given" raise nncf.ValidationError(msg) @@ -289,8 +289,8 @@ def __init__( INT4_ASYM is the same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. - MXFP4 has E2M1 weight dtype with E8M0 scale, group size 32 and no zero point. - MXFP8_E4M3 has E4M3 weight dtype with E8M0 scale, group size 32 and no zero point. + MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32. + MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32. :param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to backup_mode). :param group_size: number of weights (e.g. 128) in the channel dimension diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8d2eb5729a8..684f4f2c192 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -148,14 +148,14 @@ def do_float_quantization( Computes quantization scale if not provided, and performs corresponding (nf4, MXFP4 and MXFP8_E3M3) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For MXFP4, MXFP8_E3M3 and CODEBOOK currently returns normalized weight without quantization. - TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved + For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. + TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved :param weight: Weight array to compress. :param config: Weight compression configuration. :param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for MXFP4 and MXFP8_E3M3 normalized) weight tensor and corresponding scale tensor and + :return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and optional indexes for codebook. 
""" assert not config.is_integer @@ -192,7 +192,7 @@ def do_float_quantization( ) return compressed_weight, scale, indexes else: - # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3 once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved compressed_weight = norm_weight return compressed_weight, scale, None @@ -206,7 +206,7 @@ def float_quantize_dequantize_weight( ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values. - MXFP4 and MXFP8_E3M3 mode is currently not supported. + MXFP4 and MXFP8_E4M3 mode is currently not supported. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. @@ -216,7 +216,7 @@ def float_quantize_dequantize_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] - # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E3M3, once ticket 164851 is resolved + # TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3, once ticket 164851 is resolved # Optimized implementation if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight): diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index be01081256a..3e6f5c9979b 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -458,8 +458,8 @@ def compress_weights( INT4_ASYM is the same as INT4_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. - MXFP4 has E2M1 data dtype with E8M0 scale, group size 32 and no zero point. - MXFP8_E4M3 has E4M3 data dtype with E8M0 scale, group size 32 and no zero point. + MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32. + MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale. The size of group is 32. :type mode: nncf.CompressWeightsMode :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to INT8_ASYM). @@ -524,7 +524,7 @@ def compress_weights( CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = {"gptq": gptq, "lora_correction": lora_correction} @@ -574,7 +574,7 @@ def compress_weights( CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "Torch backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." + msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { @@ -615,7 +615,7 @@ def compress_weights( CompressWeightsMode.MXFP8_E4M3, ]: msg = ( - "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E3M3]." 
+ "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]." ) raise nncf.ParameterNotSupportedError(msg) @@ -639,7 +639,7 @@ def compress_weights( CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3, ]: - msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E3M3 and CODEBOOK modes for weight compression." + msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression." raise nncf.ParameterNotSupportedError(msg) options = { From 33aae33adb053c8fff3d4198a1124da757a487e5 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Thu, 2 Oct 2025 18:01:15 +0200 Subject: [PATCH 12/21] Typos/pre-commit --- docs/Algorithms.md | 2 +- .../algorithms/weight_compression/weight_lowering.py | 4 ++-- src/nncf/quantization/quantize_model.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/Algorithms.md b/docs/Algorithms.md index 116777b0359..6c5cc0e9f99 100644 --- a/docs/Algorithms.md +++ b/docs/Algorithms.md @@ -11,7 +11,7 @@ - Symmetric 8 bit compression mode - Symmetric and asymmetric 4 bit compression mode - NF4 compression mode - - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3) + - Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3) - MX-compliant types - MXFP4 and MXFP8_E4M3 - Mixed precision weights compression - Grouped weights compression diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 684f4f2c192..88642cbd4d9 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -81,7 +81,7 @@ def calculate_float_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig ) -> Tensor: """ - Calculates the scale for nf4 or mxfp4/mxfp8_e3m3 quantization. + Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization. :param weight: Weight array to compress. :param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max). @@ -146,7 +146,7 @@ def do_float_quantization( ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, - and performs corresponding (nf4, MXFP4 and MXFP8_E3M3) weight quantization. + and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization. For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 3e6f5c9979b..eb15cd08cbe 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -459,7 +459,8 @@ def compress_weights( with a typical non-fixed zero point. NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point. MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32. - MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale. The size of group is 32. + MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale. + The size of group is 32. 
:type mode: nncf.CompressWeightsMode :param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4 and the rest to INT8_ASYM). From e4d47abb90eb41295e5985b995a625c450f60c50 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Tue, 7 Oct 2025 19:51:21 +0200 Subject: [PATCH 13/21] Fix adjust group size --- .../weight_compression/algorithm.py | 16 ++-- .../template_test_weights_compression.py | 86 +++++++++++++++++-- .../quantization/test_weights_compression.py | 8 ++ .../quantization/test_weights_compression.py | 12 +++ .../quantization/test_weights_compression.py | 8 ++ tests/torch2/fx/test_compress_weights.py | 8 ++ 6 files changed, 124 insertions(+), 14 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index c741e163ab7..c68942ca26a 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -623,13 +623,15 @@ def _handle_adjust_group_size_fallback( group_size_values[w_params.weight_name] = self._group_size continue - # The maximal power of two that divides reduction_channel_size - adjusted_group_size = reduction_channel_size & (~reduction_channel_size + 1) - if adjusted_group_size >= self._min_adjusted_group_size: - valid_weight_params.append(w_params) - group_size_values[w_params.weight_name] = adjusted_group_size - adjusted_weight_params.append((w_params, adjusted_group_size)) - continue + # Do not adjust group size for the MX data types + if w_params.compression_config.mode not in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: + # The maximal power of two that divides reduction_channel_size + adjusted_group_size = reduction_channel_size & (~reduction_channel_size + 1) + if adjusted_group_size >= self._min_adjusted_group_size: + valid_weight_params.append(w_params) + group_size_values[w_params.weight_name] = adjusted_group_size + adjusted_weight_params.append((w_params, adjusted_group_size)) + continue invalid_weight_params.append(w_params) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 8eafa3e767e..188dd47820b 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -173,6 +173,13 @@ def get_not_supported_algorithms() -> list[str]: Returns a list of not supported weight compression algorithms. """ + @staticmethod + @abstractmethod + def get_not_supported_modes() -> list[CompressWeightsMode]: + """ + Returns a list of not supported weight compression algorithms. + """ + @staticmethod @abstractmethod def wrap_model(model, data) -> CompressionParams: @@ -349,6 +356,11 @@ def get_num_int4_nodes(model: TModel): def get_num_int4_group_sizes(model: TModel) -> dict[int, int]: "Returns number of int4 nodes for each group size." + @staticmethod + @abstractmethod + def get_num_mx_group_sizes(model: TModel) -> dict[int, int]: + "Returns number of int4 nodes for each group size." 
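The group-size adjustment above relies on a standard bit trick: `n & (~n + 1)` (equivalently `n & -n`) keeps only the lowest set bit of `n`, which equals the largest power of two dividing `n`. A small sketch with an illustrative helper name:

```python
def largest_pow2_divisor(n: int) -> int:
    # Keeping only the lowest set bit yields the maximal power of two that divides n.
    return n & (~n + 1)

# A reduction channel size that is not a multiple of the requested group size (e.g. 32)
# would be adjusted down to its largest power-of-two divisor:
assert largest_pow2_divisor(48) == 16
assert largest_pow2_divisor(40) == 8
assert largest_pow2_divisor(32) == 32
```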
+ @staticmethod @abstractmethod def get_ignored_scope_name() -> str: @@ -505,13 +517,64 @@ def test_error_message_for_invalid_group_size( "fallback_mode", "min_adjusted_group_size", "ref_num_group_sizes", + "mode", ], [ - ([8, 8, 16, 16, 16, 32], 1.0, 32, None, None, {32: 1}), - ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.IGNORE, None, {32: 1}), - ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.ADJUST, 16, {16: 3, 32: 1}), - ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.ADJUST, 32, {32: 1}), - ([8, 8, 16, 16, 16, 32], 0.5, 32, nncf.GroupSizeFallbackMode.ADJUST, 16, {16: 2}), + ([8, 8, 16, 16, 16, 32], 1.0, 32, None, None, {32: 1}, CompressWeightsMode.INT4_SYM), + ( + [8, 8, 16, 16, 16, 32], + 1.0, + 32, + nncf.GroupSizeFallbackMode.IGNORE, + None, + {32: 1}, + CompressWeightsMode.INT4_SYM, + ), + ( + [8, 8, 16, 16, 16, 32], + 1.0, + 32, + nncf.GroupSizeFallbackMode.ADJUST, + 16, + {16: 3, 32: 1}, + CompressWeightsMode.INT4_SYM, + ), + ( + [8, 8, 16, 16, 16, 32], + 1.0, + 32, + nncf.GroupSizeFallbackMode.ADJUST, + 32, + {32: 1}, + CompressWeightsMode.INT4_SYM, + ), + ( + [8, 8, 16, 16, 16, 32], + 0.5, + 32, + nncf.GroupSizeFallbackMode.ADJUST, + 16, + {16: 2}, + CompressWeightsMode.INT4_SYM, + ), + ( + [8, 8, 16, 16, 16, 32], + 1.0, + 32, + nncf.GroupSizeFallbackMode.ADJUST, + 32, + {32: 1}, + CompressWeightsMode.MXFP4, + ), + ( + [8, 8, 16, 16, 16, 32], + 1.0, + 32, + nncf.GroupSizeFallbackMode.ADJUST, + 32, + {32: 1}, + CompressWeightsMode.MXFP8_E4M3, + ), ], ) def test_group_size_fallback_modes( @@ -522,13 +585,17 @@ def test_group_size_fallback_modes( fallback_mode, min_adjusted_group_size, ref_num_group_sizes, + mode, ): + if mode in self.get_not_supported_modes(): + pytest.skip("Skipping test for not supported modes") + model = self.get_different_channel_size_model(model_channel_sizes) input_example = self.to_tensor(np.ones([1, model_channel_sizes[0], model_channel_sizes[0]], dtype=np.float32)) dataset = Dataset([input_example], self.get_transform_func()) kwargs = dict( model=model, - mode=CompressWeightsMode.INT4_SYM, + mode=mode, ratio=ratio, all_layers=True, group_size=group_size, @@ -542,7 +609,12 @@ def test_group_size_fallback_modes( model = compress_weights(**kwargs) - num_group_sizes = self.get_num_int4_group_sizes(model) + num_group_sizes = {} + if mode == CompressWeightsMode.INT4_SYM: + num_group_sizes = self.get_num_int4_group_sizes(model) + if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: + num_group_sizes = self.get_num_mx_group_sizes(model) + assert ref_num_group_sizes == num_group_sizes, ( f"Expected {ref_num_group_sizes} group size values, but got {num_group_sizes}." 
) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 534b771bdd7..6b358e3571a 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -431,6 +431,10 @@ def check_weights(model: onnx.ModelProto, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return ["gptq", "lora_correction"] + @staticmethod + def get_not_supported_modes() -> list[CompressWeightsMode]: + return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] + @staticmethod def wrap_model(model: onnx.ModelProto, data: Any) -> onnx.ModelProto: return model @@ -604,6 +608,10 @@ def get_num_int4_group_sizes(model: onnx.ModelProto) -> dict[int, int]: num[shape[-1]] += 1 return num + @staticmethod + def get_num_mx_group_sizes(model: onnx.ModelProto) -> dict[int, int]: + return {} + @staticmethod def get_ignored_scope_name() -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 6d1eb3e1762..19aaa5b199c 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1868,6 +1868,10 @@ def check_weights(model: ov.Model, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return [] + @staticmethod + def get_not_supported_modes() -> list[CompressWeightsMode]: + return [] + @staticmethod def wrap_model(model, data): return model @@ -1933,6 +1937,14 @@ def get_num_int4_group_sizes(model: ov.Model) -> dict[int, int]: num[op.get_output_shape(0)[-1]] += 1 return num + @staticmethod + def get_num_mx_group_sizes(model: ov.Model) -> dict[int, int]: + num = defaultdict(int) + for op in model.get_ops(): + if op.get_type_name() == "Constant" and op.get_element_type() in [ov.Type.f8e4m3, ov.Type.f4e2m1]: + num[op.get_output_shape(0)[-1]] += 1 + return num + @pytest.fixture(params=INT4_NF4_MODES) def int4_mode(self, request): return request.param diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index 7bf959e89e1..e16b0eb02c2 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -522,6 +522,10 @@ def check_weights(model: torch.nn.Module, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return ["lora_correction", "gptq"] + @staticmethod + def get_not_supported_modes() -> list[CompressWeightsMode]: + return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] + @staticmethod def wrap_model(model, data): model = wrap_model(model) @@ -581,6 +585,10 @@ def get_num_int4_group_sizes(model: torch.nn.Module) -> dict[int, int]: num[op.compressed_weight_shape[-1]] += 1 return num + @staticmethod + def get_num_mx_group_sizes(model: torch.nn.Module) -> dict[int, int]: + return {} + @pytest.fixture(params=INT4_MODES) def int4_mode(self, request): return request.param diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 4acf7fa397d..2e49dfcb2f8 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -386,6 +386,10 @@ def check_weights(model: torch.fx.GraphModule, ref_ids: list[int]) 
-> None: def get_not_supported_algorithms() -> list[str]: return ["lora_correction", "gptq"] + @staticmethod + def get_not_supported_modes() -> list[CompressWeightsMode]: + return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] + @staticmethod def wrap_model(model, data): if isinstance(model, torch.fx.GraphModule): @@ -457,6 +461,10 @@ def get_num_int4_group_sizes(model: torch.nn.Module) -> dict[int, int]: num[op.compressed_weight_shape[-1]] += 1 return num + @staticmethod + def get_num_mx_group_sizes(model: torch.nn.Module) -> dict[int, int]: + return {} + @pytest.fixture(params=INT4_MODES) def int4_mode(self, request): return request.param From 2aaec38493aabbf094d36ebd79017cb347cce8b0 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Wed, 8 Oct 2025 13:38:20 +0200 Subject: [PATCH 14/21] Revert "Fix adjust group size" This reverts commit e4d47abb90eb41295e5985b995a625c450f60c50. --- .../weight_compression/algorithm.py | 16 ++-- .../template_test_weights_compression.py | 86 ++----------------- .../quantization/test_weights_compression.py | 8 -- .../quantization/test_weights_compression.py | 12 --- .../quantization/test_weights_compression.py | 8 -- tests/torch2/fx/test_compress_weights.py | 8 -- 6 files changed, 14 insertions(+), 124 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index c68942ca26a..c741e163ab7 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -623,15 +623,13 @@ def _handle_adjust_group_size_fallback( group_size_values[w_params.weight_name] = self._group_size continue - # Do not adjust group size for the MX data types - if w_params.compression_config.mode not in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: - # The maximal power of two that divides reduction_channel_size - adjusted_group_size = reduction_channel_size & (~reduction_channel_size + 1) - if adjusted_group_size >= self._min_adjusted_group_size: - valid_weight_params.append(w_params) - group_size_values[w_params.weight_name] = adjusted_group_size - adjusted_weight_params.append((w_params, adjusted_group_size)) - continue + # The maximal power of two that divides reduction_channel_size + adjusted_group_size = reduction_channel_size & (~reduction_channel_size + 1) + if adjusted_group_size >= self._min_adjusted_group_size: + valid_weight_params.append(w_params) + group_size_values[w_params.weight_name] = adjusted_group_size + adjusted_weight_params.append((w_params, adjusted_group_size)) + continue invalid_weight_params.append(w_params) diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py index 188dd47820b..8eafa3e767e 100644 --- a/tests/cross_fw/test_templates/template_test_weights_compression.py +++ b/tests/cross_fw/test_templates/template_test_weights_compression.py @@ -173,13 +173,6 @@ def get_not_supported_algorithms() -> list[str]: Returns a list of not supported weight compression algorithms. """ - @staticmethod - @abstractmethod - def get_not_supported_modes() -> list[CompressWeightsMode]: - """ - Returns a list of not supported weight compression algorithms. 
- """ - @staticmethod @abstractmethod def wrap_model(model, data) -> CompressionParams: @@ -356,11 +349,6 @@ def get_num_int4_nodes(model: TModel): def get_num_int4_group_sizes(model: TModel) -> dict[int, int]: "Returns number of int4 nodes for each group size." - @staticmethod - @abstractmethod - def get_num_mx_group_sizes(model: TModel) -> dict[int, int]: - "Returns number of int4 nodes for each group size." - @staticmethod @abstractmethod def get_ignored_scope_name() -> str: @@ -517,64 +505,13 @@ def test_error_message_for_invalid_group_size( "fallback_mode", "min_adjusted_group_size", "ref_num_group_sizes", - "mode", ], [ - ([8, 8, 16, 16, 16, 32], 1.0, 32, None, None, {32: 1}, CompressWeightsMode.INT4_SYM), - ( - [8, 8, 16, 16, 16, 32], - 1.0, - 32, - nncf.GroupSizeFallbackMode.IGNORE, - None, - {32: 1}, - CompressWeightsMode.INT4_SYM, - ), - ( - [8, 8, 16, 16, 16, 32], - 1.0, - 32, - nncf.GroupSizeFallbackMode.ADJUST, - 16, - {16: 3, 32: 1}, - CompressWeightsMode.INT4_SYM, - ), - ( - [8, 8, 16, 16, 16, 32], - 1.0, - 32, - nncf.GroupSizeFallbackMode.ADJUST, - 32, - {32: 1}, - CompressWeightsMode.INT4_SYM, - ), - ( - [8, 8, 16, 16, 16, 32], - 0.5, - 32, - nncf.GroupSizeFallbackMode.ADJUST, - 16, - {16: 2}, - CompressWeightsMode.INT4_SYM, - ), - ( - [8, 8, 16, 16, 16, 32], - 1.0, - 32, - nncf.GroupSizeFallbackMode.ADJUST, - 32, - {32: 1}, - CompressWeightsMode.MXFP4, - ), - ( - [8, 8, 16, 16, 16, 32], - 1.0, - 32, - nncf.GroupSizeFallbackMode.ADJUST, - 32, - {32: 1}, - CompressWeightsMode.MXFP8_E4M3, - ), + ([8, 8, 16, 16, 16, 32], 1.0, 32, None, None, {32: 1}), + ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.IGNORE, None, {32: 1}), + ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.ADJUST, 16, {16: 3, 32: 1}), + ([8, 8, 16, 16, 16, 32], 1.0, 32, nncf.GroupSizeFallbackMode.ADJUST, 32, {32: 1}), + ([8, 8, 16, 16, 16, 32], 0.5, 32, nncf.GroupSizeFallbackMode.ADJUST, 16, {16: 2}), ], ) def test_group_size_fallback_modes( @@ -585,17 +522,13 @@ def test_group_size_fallback_modes( fallback_mode, min_adjusted_group_size, ref_num_group_sizes, - mode, ): - if mode in self.get_not_supported_modes(): - pytest.skip("Skipping test for not supported modes") - model = self.get_different_channel_size_model(model_channel_sizes) input_example = self.to_tensor(np.ones([1, model_channel_sizes[0], model_channel_sizes[0]], dtype=np.float32)) dataset = Dataset([input_example], self.get_transform_func()) kwargs = dict( model=model, - mode=mode, + mode=CompressWeightsMode.INT4_SYM, ratio=ratio, all_layers=True, group_size=group_size, @@ -609,12 +542,7 @@ def test_group_size_fallback_modes( model = compress_weights(**kwargs) - num_group_sizes = {} - if mode == CompressWeightsMode.INT4_SYM: - num_group_sizes = self.get_num_int4_group_sizes(model) - if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: - num_group_sizes = self.get_num_mx_group_sizes(model) - + num_group_sizes = self.get_num_int4_group_sizes(model) assert ref_num_group_sizes == num_group_sizes, ( f"Expected {ref_num_group_sizes} group size values, but got {num_group_sizes}." 
) diff --git a/tests/onnx/quantization/test_weights_compression.py b/tests/onnx/quantization/test_weights_compression.py index 6b358e3571a..534b771bdd7 100644 --- a/tests/onnx/quantization/test_weights_compression.py +++ b/tests/onnx/quantization/test_weights_compression.py @@ -431,10 +431,6 @@ def check_weights(model: onnx.ModelProto, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return ["gptq", "lora_correction"] - @staticmethod - def get_not_supported_modes() -> list[CompressWeightsMode]: - return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] - @staticmethod def wrap_model(model: onnx.ModelProto, data: Any) -> onnx.ModelProto: return model @@ -608,10 +604,6 @@ def get_num_int4_group_sizes(model: onnx.ModelProto) -> dict[int, int]: num[shape[-1]] += 1 return num - @staticmethod - def get_num_mx_group_sizes(model: onnx.ModelProto) -> dict[int, int]: - return {} - @staticmethod def get_ignored_scope_name() -> str: return "MatMul_4" # Zero-based indices (e.g., MatMul_0, MatMul_1, ...) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 19aaa5b199c..6d1eb3e1762 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1868,10 +1868,6 @@ def check_weights(model: ov.Model, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return [] - @staticmethod - def get_not_supported_modes() -> list[CompressWeightsMode]: - return [] - @staticmethod def wrap_model(model, data): return model @@ -1937,14 +1933,6 @@ def get_num_int4_group_sizes(model: ov.Model) -> dict[int, int]: num[op.get_output_shape(0)[-1]] += 1 return num - @staticmethod - def get_num_mx_group_sizes(model: ov.Model) -> dict[int, int]: - num = defaultdict(int) - for op in model.get_ops(): - if op.get_type_name() == "Constant" and op.get_element_type() in [ov.Type.f8e4m3, ov.Type.f4e2m1]: - num[op.get_output_shape(0)[-1]] += 1 - return num - @pytest.fixture(params=INT4_NF4_MODES) def int4_mode(self, request): return request.param diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py index e16b0eb02c2..7bf959e89e1 100644 --- a/tests/torch2/function_hook/quantization/test_weights_compression.py +++ b/tests/torch2/function_hook/quantization/test_weights_compression.py @@ -522,10 +522,6 @@ def check_weights(model: torch.nn.Module, ref_ids: list[int]) -> None: def get_not_supported_algorithms() -> list[str]: return ["lora_correction", "gptq"] - @staticmethod - def get_not_supported_modes() -> list[CompressWeightsMode]: - return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] - @staticmethod def wrap_model(model, data): model = wrap_model(model) @@ -585,10 +581,6 @@ def get_num_int4_group_sizes(model: torch.nn.Module) -> dict[int, int]: num[op.compressed_weight_shape[-1]] += 1 return num - @staticmethod - def get_num_mx_group_sizes(model: torch.nn.Module) -> dict[int, int]: - return {} - @pytest.fixture(params=INT4_MODES) def int4_mode(self, request): return request.param diff --git a/tests/torch2/fx/test_compress_weights.py b/tests/torch2/fx/test_compress_weights.py index 2e49dfcb2f8..4acf7fa397d 100644 --- a/tests/torch2/fx/test_compress_weights.py +++ b/tests/torch2/fx/test_compress_weights.py @@ -386,10 +386,6 @@ def check_weights(model: torch.fx.GraphModule, ref_ids: list[int]) 
-> None: def get_not_supported_algorithms() -> list[str]: return ["lora_correction", "gptq"] - @staticmethod - def get_not_supported_modes() -> list[CompressWeightsMode]: - return [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] - @staticmethod def wrap_model(model, data): if isinstance(model, torch.fx.GraphModule): @@ -461,10 +457,6 @@ def get_num_int4_group_sizes(model: torch.nn.Module) -> dict[int, int]: num[op.compressed_weight_shape[-1]] += 1 return num - @staticmethod - def get_num_mx_group_sizes(model: torch.nn.Module) -> dict[int, int]: - return {} - @pytest.fixture(params=INT4_MODES) def int4_mode(self, request): return request.param From ab6aa74452cb0d05fbcf312a8df556dd1798b3a8 Mon Sep 17 00:00:00 2001 From: dlyakhov Date: Wed, 8 Oct 2025 13:44:55 +0200 Subject: [PATCH 15/21] Fail for MX with adjust fallback mode --- .../algorithms/weight_compression/algorithm.py | 15 ++++++++++++--- .../quantization/test_weights_compression.py | 12 ++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/algorithm.py b/src/nncf/quantization/algorithms/weight_compression/algorithm.py index c741e163ab7..50a5399a5c8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/src/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -245,9 +245,18 @@ def check_user_compression_configuration( f"Supported modes are: {[e.value for e in GroupSizeFallbackMode]}." ) raise nncf.ValidationError(msg) - if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] and group_size not in [None, 32]: - msg = f"MXFP4 and MXFP8_E4M3 types only support group size of 32, group size of {group_size} is given" - raise nncf.ValidationError(msg) + if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: + if group_size not in [None, 32]: + msg = f"MXFP4 and MXFP8_E4M3 types only support group size of 32, group size of {group_size} is given" + raise nncf.ValidationError(msg) + + if advanced_parameters and advanced_parameters.group_size_fallback_mode is GroupSizeFallbackMode.ADJUST: + msg = ( + "MXFP4 and MXFP8_E4M3 types do not support the group size" + f" fallback mode {advanced_parameters.group_size_fallback_mode.value}." + " Please use other group size fallback mode." 
+ ) + raise nncf.ValidationError(msg) class WeightCompression(Algorithm): diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 6d1eb3e1762..017d695c25e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -41,6 +41,7 @@ from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams +from nncf.quantization.advanced_parameters import GroupSizeFallbackMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA @@ -809,6 +810,17 @@ def test_raise_error_with_unsupported_group_size_for_fp(self, mode): with pytest.raises(nncf.ValidationError): compress_weights(ov.Model([], []), dataset="anything", mode=mode, group_size=64) + def test_raise_error_with_unsupported_(self, mode): + with pytest.raises(nncf.ValidationError): + compress_weights( + ov.Model([], []), + dataset="anything", + mode=mode, + advanced_parameters=AdvancedCompressionParameters( + group_size_fallback_mode=GroupSizeFallbackMode.ADJUST + ), + ) + @pytest.mark.parametrize("mode", INT4_NF4_MODES) @pytest.mark.parametrize( From a25b5c39c46e34f28617cadfaddd454841885207 Mon Sep 17 00:00:00 2001 From: Daniil Lyakhov Date: Wed, 8 Oct 2025 14:42:09 +0200 Subject: [PATCH 16/21] Update src/nncf/quantization/algorithms/weight_compression/weight_lowering.py Co-authored-by: andreyanufr --- .../algorithms/weight_compression/weight_lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 88642cbd4d9..0e0783cf468 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -110,7 +110,7 @@ def calculate_float_quantization_params( scale = fns.where(fns.abs(scale) < eps, eps, scale) if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: - # FP types are using E8M0 type scale. + # MXFP types are using E8M0 type scale. # It can only contain values [2**(-127), 2**(-126), ..., 2**(126), 2**(127)]. # Here, we quantize each element of the scale to the smallest possible value greater than or equal to # the element value to make it possible to convert the float scale value to a FP format without rounding. From c026573a883cc7a0f2f6bb81309b9dcdb3f5467c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 8 Oct 2025 15:30:08 +0200 Subject: [PATCH 17/21] Revert nightly installation --- .github/workflows/call_precommit.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/call_precommit.yml b/.github/workflows/call_precommit.yml index 93879e8a5b6..6c2f58ae9ce 100644 --- a/.github/workflows/call_precommit.yml +++ b/.github/workflows/call_precommit.yml @@ -91,8 +91,6 @@ jobs: shell: bash - name: Install NNCF and test requirements run: pip install . 
-r tests/openvino/requirements.txt - - name: Install OpenVINO nightly - run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope From a000f87f94af798c8ede8a126f12250664db1360 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 8 Oct 2025 15:43:28 +0200 Subject: [PATCH 18/21] Post-merge fixes --- src/nncf/openvino/optimized_functions/models.py | 2 +- .../algorithms/weight_compression/weight_lowering.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 35478fd5d0c..f64b5d020a7 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -626,7 +626,7 @@ def _build_float_quantization_model( eps = np.finfo(np.float32).eps scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale) - if config.mode == CompressWeightsMode.E2M1: + if config.mode == CompressWeightsMode.MXFP4: scale = scale / opset.constant(6.0, ov.Type.f32) scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32)) scale = opset.ceil(scale) diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 34afe34067f..4756b588e8f 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -185,7 +185,7 @@ def do_float_quantization( norm_weight = _calculate_normalized_weight(weight, scale) if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]: if original_weight_backend == TensorBackend.ov: - # Can convert through OpenVINO and return OpenVINO-native NF4 tensor + # Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype) else: From 831bf25e919a33d3274292d3c3f0fb3788958396 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 8 Oct 2025 16:00:06 +0200 Subject: [PATCH 19/21] Post-merge fixes part 2 --- .../optimized_functions/test_compression_functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 4a0bd9da077..c004a4b2d68 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -70,8 +70,8 @@ class QuantizationTask(Enum): FP4_COMPRESSION_CONFIGS = [ WeightCompressionConfig(CompressWeightsMode.NF4), WeightCompressionConfig(CompressWeightsMode.NF4, group_size=2), - WeightCompressionConfig(CompressWeightsMode.E2M1), - WeightCompressionConfig(CompressWeightsMode.E2M1, group_size=2), + WeightCompressionConfig(CompressWeightsMode.MXFP4), + WeightCompressionConfig(CompressWeightsMode.MXFP4, group_size=2), ] COMPRESSION_CONFIGS = INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS @@ -376,14 +376,14 @@ def get_input_node_data(node: ov.Node, input_id: int) -> Tensor: or compression_kwargs.get("lora_correction") ) - if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM, 
CompressWeightsMode.E2M1]: + if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM, CompressWeightsMode.MXFP4]: if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM] and weight_dtype in [ TensorDataType.f8e4m3, TensorDataType.f8e5m2, ]: pytest.skip("INT8 compression is not supported for f8 dtypes.") if is_data_aware: - pytest.skip("Data-aware compression is not supported for INT8 or F4E2M1 modes.") + pytest.skip("Data-aware compression is not supported for INT8 or MXFP4 modes.") else: compression_kwargs["all_layers"] = True From 344b94becfd11198e0fdfa67ed8c409aebb4461e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 8 Oct 2025 16:56:57 +0200 Subject: [PATCH 20/21] Increase test weight channel size --- .../test_compression_functions.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index c004a4b2d68..8803aad7d59 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -70,13 +70,12 @@ class QuantizationTask(Enum): FP4_COMPRESSION_CONFIGS = [ WeightCompressionConfig(CompressWeightsMode.NF4), WeightCompressionConfig(CompressWeightsMode.NF4, group_size=2), - WeightCompressionConfig(CompressWeightsMode.MXFP4), - WeightCompressionConfig(CompressWeightsMode.MXFP4, group_size=2), + WeightCompressionConfig(CompressWeightsMode.MXFP4, group_size=32), ] COMPRESSION_CONFIGS = INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS -WEIGHT_SHAPE = (10000, 4) +WEIGHT_SHAPE = (10000, 32) REDUCTION_AXES = (1,) @@ -308,7 +307,9 @@ def test_integer_quantization_error_alignment(weight_shape, config, tensor_backe else: mock.assert_called_once() - _check_values(results) + # It seems like numpy and openvino summate elements in different order during reduce_sum / reduce_mean computation. + # This results in small numerical differences. + _check_values(results, atol=1e-6) @pytest.mark.xfail( @@ -510,9 +511,9 @@ def _check_backends_and_dtypes( assert decompressed_weight.dtype == TensorDataType.float32 -def _check_values(results): +def _check_values(results, atol=0.0): def format_list_of_floats(lst): - return ", ".join(f"{x:.6f}" for x in lst) + return ", ".join(f"{x:.10f}" for x in lst) # Check that the computed tensors are equal between implementations keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) @@ -521,16 +522,16 @@ def format_list_of_floats(lst): ov_result = results[ComputationBackend.OV][key] if isinstance(numpy_result, float) and isinstance(ov_result, float): - numpy_result = np.array([numpy_result], dtype=np.float32) - ov_result = np.array([ov_result], dtype=np.float32) + numpy_result = Tensor(np.array([numpy_result], dtype=np.float32)) + ov_result = Tensor(np.array([ov_result], dtype=np.float32)) # Note: For static-shaped OV models doing asymmetric compression with convertable divisions there maybe # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. 
try: - np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=0, rtol=0) + np.testing.assert_allclose(ov_result.data, numpy_result.data, atol=atol, rtol=0) except AssertionError: - not_equal_mask = ov_result.data != numpy_result.data + not_equal_mask = np.not_equal(ov_result.data, numpy_result.data) msg = ( f"Results do not align for {key} with " f"{not_equal_mask.sum() / ov_result.data.size * 100:.2f} % misalignment ratio.\n" From 089631aec9bd21aa6ecfa35965917c66d494204e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 13 Oct 2025 17:29:34 +0200 Subject: [PATCH 21/21] Address suggested changes --- .../openvino/optimized_functions/functions.py | 3 +-- .../weight_compression/weight_lowering.py | 3 +-- .../template_test_nncf_tensor.py | 8 +++--- .../quantization/test_weights_compression.py | 27 +++++++++++++++++++ 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index 292518105ab..455ecb18279 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -108,8 +108,7 @@ def do_float_quantization( ) -> tuple[Tensor, Tensor, Tensor]: """ Computes quantization scale if not provided, and performs corresponding float weight quantization. - For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For MXFP4 quantization quantizes the weights to 16 levels on [-6, 6] interval. + NF4 format uses 16 levels in [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6]. :param weight: Weight array to compress. :param config: Weight compression configuration. diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 4756b588e8f..6d17dd28870 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -149,8 +149,7 @@ def do_float_quantization( """ Computes quantization scale if not provided, and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization. - For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval. - For MXFP4 quantization quantizes the weights to 16 levels on [-6, 6] interval. + NF4 format uses 16 levels in [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6]. For MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization. For CODEBOOK currently returns normalized weight without quantization. diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index 6545ca5fb8b..fb1aa3d00ec 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -116,7 +116,7 @@ def test_operators_bool(self, op_name, value): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. 
assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", OPERATOR_MAP.keys()) @@ -136,7 +136,7 @@ def test_operators_tensor(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", OPERATOR_MAP.keys()) @@ -155,7 +155,7 @@ def test_operators_int(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", BINARY_OPERATORS) @@ -174,7 +174,7 @@ def test_operators_int_rev(self, op_name): assert isinstance(res_nncf, Tensor) if ( self.backend() != TensorBackend.tf - ): # native Tensorflow operaors do not guarantee to return a tensor on an initial device. + ): # native Tensorflow operators do not guarantee to return a tensor on an initial device. assert res_nncf.device == nncf_tensor_a.device @pytest.mark.parametrize("op_name", COMPARISON_OPERATOR_MAP.keys()) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 579db6d1ebc..692f51c42d3 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1807,6 +1807,33 @@ def test_nf4_quantization_mid_quant(weight, scale): np.testing.assert_allclose(nf4_quant.data, ref_nf4_quant.data, atol=0, rtol=0) +@pytest.mark.parametrize( + "input_val,expected_val,description", + [ + (-7.0, -6.0, "Lower than quantile range"), + (7.0, 6.0, "Higher than quantile range"), + (-5.0, -4.0, "Should pick nearest EVEN index (index 2: -4.0)"), + (-3.5, -4.0, "Should pick nearest EVEN index (index 2: -4.0)"), + (1.75, 2.0, "Should pick nearest EVEN index (index 12: 2.0)"), + (2.5, 2.0, "Should pick nearest EVEN index (index 12: 2.0)"), + (-4.0, -4.0, "Exactly on a quantile"), + (0.0, 0.0, "Value 0.0 is on quantile boundary"), + (-0.0, 0.0, "Value -0.0 is on quantile boundary"), + (-0.25, 0.0, "Should round up, 0.0 (even index)"), + (0.25, 0.0, "Should round down, 0.0 (even index)"), + (-0.49, -0.5, "Closer to -0.5"), + (-0.51, -0.5, "Closer to -0.5)"), + ], +) +def test_mxfp4_quantization_edge_cases(input_val, expected_val, description): + norm_weight = Tensor(np.array([input_val], dtype=np.float32)) + result = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.MXFP4) + + assert result.data[0] == expected_val, ( + f"{description}: Expected {expected_val}, got {result.data[0]} for input value {input_val}" + ) + + @pytest.mark.parametrize( "codebook", [