From 3abe6e3f2038948d081412da720240931fecf8d4 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 20 Nov 2025 16:50:06 +0100 Subject: [PATCH 1/5] Initial commit --- .../openvino/optimized_functions/functions.py | 7 +- .../openvino/optimized_functions/models.py | 24 +-- .../algorithms/weight_compression/config.py | 37 ++++- .../weight_compression/constants.py | 9 ++ .../weight_compression/fp8_conversion.py | 149 ++++++++++++++++++ .../weight_compression/openvino_backend.py | 31 +--- .../weight_compression/weight_lowering.py | 78 +++++---- src/nncf/tensor/definitions.py | 2 + src/nncf/tensor/functions/openvino_numeric.py | 1 + .../test_compression_functions.py | 29 ++-- 10 files changed, 270 insertions(+), 97 deletions(-) create mode 100644 src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py diff --git a/src/nncf/openvino/optimized_functions/functions.py b/src/nncf/openvino/optimized_functions/functions.py index cb9845bfc22..0c05be614ad 100644 --- a/src/nncf/openvino/optimized_functions/functions.py +++ b/src/nncf/openvino/optimized_functions/functions.py @@ -116,7 +116,7 @@ def do_float_quantization( :param precomputed_scale: Optional precomputed scale. :return: Returns quantized weight tensor and corresponding scale tensor. """ - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4] + assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK] weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape @@ -128,8 +128,7 @@ def do_float_quantization( if weight.backend == TensorBackend.ov: # Return ov tensors in target precision to seamlessly insert them into openvino model later ov_model_params.return_ov_tensors = True - weight_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 - ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype}) + ov_model_params.output_dtypes.update({"compressed_weight": config.compression_dtype}) model = get_float_quantization_model( ov_model_params, @@ -234,7 +233,7 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. 
""" - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4] + assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK] # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: diff --git a/src/nncf/openvino/optimized_functions/models.py b/src/nncf/openvino/optimized_functions/models.py index 64be0e0b134..ba8e9a6c99d 100644 --- a/src/nncf/openvino/optimized_functions/models.py +++ b/src/nncf/openvino/optimized_functions/models.py @@ -31,6 +31,7 @@ from nncf.openvino.graph.node_utils import convert_op from nncf.openvino.graph.node_utils import non_convertable_divide_op from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.functions.openvino_numeric import DTYPE_MAP as DTYPE_MAP_OV @@ -571,7 +572,7 @@ def _build_float_quantization_model( reduction_axes: Optional[ReductionAxes] = None, return_nodes: bool = False, ) -> Union[ModelCallable, ModelAsNodes]: - assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4] + assert config.mode not in [CompressWeightsMode.CB4_F8E4M3, CompressWeightsMode.CODEBOOK] default_input_dtypes = {"scale": TensorDataType.float32} default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32} @@ -597,7 +598,12 @@ def _build_float_quantization_model( ) # Validate output dtypes - valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1] + valid_compressed_weight_dtypes = [ + TensorDataType.float32, + TensorDataType.nf4, + TensorDataType.f4e2m1, + TensorDataType.f8e4m3, + ] if compressed_weight_dtype not in valid_compressed_weight_dtypes: msg = ( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. 
" @@ -625,23 +631,17 @@ def _build_float_quantization_model( eps = np.finfo(np.float32).eps scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale) - # Equals 1.0 for NF4 - FP_MAX_VALS = { - CompressWeightsMode.MXFP4: 6.0, - CompressWeightsMode.FP4: 6.0, - } - if config.mode in FP_MAX_VALS: - scale = divide_op(scale, opset.constant(FP_MAX_VALS[config.mode], ov.Type.f32)) + if config.compression_dtype != TensorDataType.nf4: + scale = divide_op(scale, opset.constant(FP_MAX_VALUES[config.compression_dtype], ov.Type.f32)) - if config.mode == CompressWeightsMode.MXFP4: + if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32)) scale = opset.ceil(scale) scale = opset.clamp(scale, -127.0, 127.0) scale = opset.power(opset.constant(2.0, ov.Type.f32), scale) compressed_weight = divide_op(weight, scale) - target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1 - compressed_weight = convert_op(compressed_weight, target_dtype) + compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[config.compression_dtype]) compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] diff --git a/src/nncf/quantization/algorithms/weight_compression/config.py b/src/nncf/quantization/algorithms/weight_compression/config.py index 2af1fc86969..ebaf0440d4c 100644 --- a/src/nncf/quantization/algorithms/weight_compression/config.py +++ b/src/nncf/quantization/algorithms/weight_compression/config.py @@ -46,7 +46,17 @@ def num_bits(self): """ :return: number of bits that is used for storing a single quantized value in the given mode. """ - return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4 + return ( + 8 + if self.mode + in [ + CompressWeightsMode.INT8_SYM, + CompressWeightsMode.INT8_ASYM, + CompressWeightsMode.FP8_E4M3, + CompressWeightsMode.MXFP8_E4M3, + ] + else 4 + ) @property def is_asym_mode(self): @@ -74,6 +84,31 @@ def is_codebook(self): """ return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + @property + def compression_dtype(self) -> TensorDataType: + """ + :return: data type that is used to store compressed weights. 
+ """ + if self.is_codebook: + n_quants = self.codebook_values.size + if n_quants <= 16: + return TensorDataType.uint4 + if n_quants <= 256: + return TensorDataType.uint8 + return TensorDataType.uint16 + dtype_per_mode = { + CompressWeightsMode.INT4_SYM: TensorDataType.int4, + CompressWeightsMode.INT4_ASYM: TensorDataType.uint4, + CompressWeightsMode.INT8_ASYM: TensorDataType.uint8, + CompressWeightsMode.INT8_SYM: TensorDataType.int8, + CompressWeightsMode.NF4: TensorDataType.nf4, + CompressWeightsMode.FP4: TensorDataType.f4e2m1, + CompressWeightsMode.MXFP4: TensorDataType.f4e2m1, + CompressWeightsMode.FP8_E4M3: TensorDataType.f8e4m3, + CompressWeightsMode.MXFP8_E4M3: TensorDataType.f8e4m3, + } + return dtype_per_mode[self.mode] + def get_numpy_codebook(self): return self.codebook_values.as_numpy_tensor() diff --git a/src/nncf/quantization/algorithms/weight_compression/constants.py b/src/nncf/quantization/algorithms/weight_compression/constants.py index 3e8a0dd9a3f..d01e880be40 100644 --- a/src/nncf/quantization/algorithms/weight_compression/constants.py +++ b/src/nncf/quantization/algorithms/weight_compression/constants.py @@ -11,6 +11,8 @@ import numpy as np +from nncf.tensor import TensorDataType + NF4_QUANTILES = np.array( [ -1.0, @@ -101,3 +103,10 @@ CENTER_OF_F4E2M1_QUANTILES = (F4E2M1_QUANTILES[1:] + F4E2M1_QUANTILES[:-1]) / 2 + + +FP_MAX_VALUES = { + TensorDataType.nf4: 1.0, + TensorDataType.f4e2m1: 6.0, + TensorDataType.f8e4m3: 448.0, +} diff --git a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py new file mode 100644 index 00000000000..d78ca4df6aa --- /dev/null +++ b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + + +F8E4M3_LUT = np.array( + [ + 0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625, 0.01171875, 0.013671875, + 0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625, 0.02734375, 0.029296875, + 0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875, 0.05859375, + 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, + 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, + 0.25, 0.28125, 0.3125, 0.34375, 0.375, 0.40625, 0.4375, 0.46875, + 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, + 1.0, 1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875, + 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, + 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, + 32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0, + 64.0, 72.0, 80.0, 88.0, 96.0, 104.0, 112.0, 120.0, + 128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0, + 256.0, 288.0, 320.0, 352.0, 384.0, 416.0, 448.0, np.nan, + ], + dtype=np.float32, +) + + +def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: + """Exact port of ov::f16_to_f8e4m3_bits for a single float16 bit-pattern.""" + # f16 layout + f16_s_mask = 0x8000 + f16_e_mask = 0x7C00 + f16_e_bias = 15 + f16_e_size = 5 + f16_m_mask = 0x03FF + f16_m_size = 10 + + # f8 e4m3 layout + f8e4m3_s_mask = 0x80 + f8e4m3_e_size = 4 + f8e4m3_e_mask = 0x78 + f8e4m3_e_bias = 7 + f8e4m3_e_max = 0x0F + f8e4m3_m_size = 3 + f8e4m3_m_mask = 0x07 + + byte_shift = 8 + + # f8 masks in uint16 domain + f8_e_mask = f8e4m3_e_mask << byte_shift # 0x7800 + f8_m_mask = f8e4m3_m_mask << byte_shift # 0x0700 + f8_m_hidden_one_mask = 0x0800 # hidden 1 for subnormals + + # rounding constants (same as C++) + round_half = 0x01FF + round_norm = 0x007F + round_even = 0x0080 + round_odd = 0x0180 + + # min exponent for which subnormals are representable + f8_e_subnormal_min = -10 + + inp = int(h_bits) & 0xFFFF + + # sign bit: f16 sign -> f8 sign position (bit 15 -> bit 7) + f8_bits = (inp & f16_s_mask) >> byte_shift + + f16_e_field = inp & f16_e_mask + + if f16_e_field == f16_e_mask: + # f16 NaN / Inf -> f8 NaN (no Inf) + f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask) + elif f16_e_field != 0: + # normalized f16 + f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias) + # *** IMPORTANT FIX: shift by (f16_e_size - f8e4m3_e_size) = 5 - 4 = 1 *** + fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size) + + # normalized f8 part (exp >= 0) + if f8_biased_exp >= 0: + if (fractional & round_half) == round_odd or (fractional & round_norm) != 0: + fractional += round_even + if (fractional & f8_e_mask) != 0: + f8_biased_exp += 1 + fractional &= f8_m_mask + + # now set exponent & mantissa + if f8_biased_exp > f8e4m3_e_max: + # overflow -> NaN (no Inf) + f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask) + elif f8_biased_exp > 0: + # normalized f8 + exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size + f8_bits |= exp_field + f8_bits |= (fractional >> byte_shift) + else: + # subnormal f8 + fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) + f8_exp = f8_biased_exp - f8e4m3_e_bias + shift = 1 - f8_exp + sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1) + sticky = 1 if (fractional & sticky_mask) != 0 else 0 + + fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp)) + + if (((fractional & round_half) == round_odd 
and sticky == 0) or + (fractional & round_norm) != 0 or sticky != 0): + fractional += round_even + + f8_bits |= (fractional >> byte_shift) + else: + # f16 zero / subnormal -> sign + zero exponent/mantissa + # (f8_bits already contains the sign) + pass + + return f8_bits & 0xFF + + +_f16_to_f8e4m3_bits_vec = np.vectorize(_f16_to_f8e4m3_bits_scalar, otypes=[np.uint8]) + + +def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray: + """ + Bit-exact to ov::float8_e4m3(float): + float32 -> float16 -> f8e4m3 bits -> float via LUT + """ + x = np.asarray(x, dtype=np.float32) + x_f16 = x.astype(np.float16) + h_bits = x_f16.view(np.uint16) + + f8_bits = _f16_to_f8e4m3_bits_vec(h_bits) + + # Decode exactly like C++: LUT for magnitude + sign bit + idx = f8_bits & 0x7F + mag = F8E4M3_LUT[idx.astype(np.int32)] + + sign = np.where((f8_bits & 0x80) != 0, -1.0, 1.0) + out = sign * mag + return out.astype(np.float32) diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 3ec241b36c6..50483e20690 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -64,6 +64,7 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorDataType +from nncf.tensor.functions.openvino_numeric import DTYPE_MAP from nncf.tensor.functions.openvino_numeric import DTYPE_MAP_REV @@ -223,32 +224,11 @@ def _create_compression_subgraph( should_add_convert_node: bool, precomputed_compressed_weight: Optional[CompressedWeight] = None, ): - scale_dtype = ov.Type.f16 - if compression_config.mode == CompressWeightsMode.NF4: - compression_dtype = ov.Type.nf4 - elif compression_config.mode == CompressWeightsMode.MXFP4: - compression_dtype = ov.Type.f4e2m1 + compression_dtype = DTYPE_MAP[compression_config.compression_dtype] + if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: scale_dtype = ov.Type.f8e8m0 - elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3: - compression_dtype = ov.Type.f8e4m3 - scale_dtype = ov.Type.f8e8m0 - elif compression_config.mode == CompressWeightsMode.FP8_E4M3: - compression_dtype = ov.Type.f8e4m3 - elif compression_config.mode == CompressWeightsMode.FP4: - compression_dtype = ov.Type.f4e2m1 - elif compression_config.mode == CompressWeightsMode.INT4_SYM: - compression_dtype = ov.Type.i4 - elif compression_config.mode == CompressWeightsMode.INT4_ASYM: - compression_dtype = ov.Type.u4 - elif compression_config.mode == CompressWeightsMode.INT8_SYM: - compression_dtype = ov.Type.i8 - elif compression_config.mode == CompressWeightsMode.INT8_ASYM: - compression_dtype = ov.Type.u8 - elif compression_config.is_codebook: - compression_dtype = None else: - msg = f"{compression_config.mode.value} is not supported." 
- raise nncf.ParameterNotSupportedError(msg) + scale_dtype = ov.Type.f16 original_shape = weight.shape @@ -261,8 +241,7 @@ def _create_compression_subgraph( ) if compression_config.is_codebook: - n_quants = compressed_weight.codebook.size - 1 - compression_dtype = ov.Type.u16 if n_quants > 255 else (ov.Type.u8 if n_quants > 15 else ov.Type.u4) + compression_dtype = DTYPE_MAP[compression_config.compression_dtype] converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook if compression_config.mode == CompressWeightsMode.CODEBOOK diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d0c96e952fb..4542368d4f4 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -22,6 +22,7 @@ from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_F4E2M1_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import F4E2M1_QUANTILES +from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point @@ -41,6 +42,8 @@ CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4, + CompressWeightsMode.FP8_E4M3, + CompressWeightsMode.MXFP8_E4M3, ) MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000 @@ -106,15 +109,9 @@ def calculate_float_quantization_params( weight = weight.astype(TensorDataType.float32) scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) - FP_MAX_VALS = { - CompressWeightsMode.MXFP4: 6.0, - CompressWeightsMode.MXFP8_E4M3: 448.0, - CompressWeightsMode.FP4: 6.0, - CompressWeightsMode.FP8_E4M3: 448.0, - } - if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()): - if config.mode in FP_MAX_VALS: - max_val = FP_MAX_VALS[config.mode] + if config.mode != CompressWeightsMode.NF4: + if config.compression_dtype in FP_MAX_VALUES: + max_val = FP_MAX_VALUES[config.compression_dtype] else: max_val = fns.max(fns.abs(config.get_numpy_codebook())) scale = scale / max_val @@ -157,19 +154,19 @@ def do_float_quantization( config: WeightCompressionConfig, reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor, Tensor]: +) -> tuple[Tensor, Tensor, Optional[Tensor]]: """ - Computes quantization scale if not provided, - and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP4, FP8_E4M3) weight quantization. - NF4 format uses 16 levels in [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6]. - For MXFP8_E4M3, FP8_E4M3, FP4 and CODEBOOK currently returns normalized weight without quantization. + Computes quantization scale if not provided and performs corresponding weight quantization. + NF4 format uses 16 levels in [-1, 1] range, MXFP4 uses 16 levels in [-6, 6], and MXFP8_E4M3 uses 256 levels + in [-448, 448]. + For CODEBOOK modes, the normalized weight is currently returned without quantization. :param weight: Weight array to compress. :param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics. :param precomputed_scale: Optional precomputed scale. - :return: Returns quantized (for MXFP8_E4M3, FP4, FP8_E4M3 and codebook normalized) - weight tensor and corresponding scale tensor and optional indexes for codebook. + :return: Quantized weight tensor (normalized, but not quantized, for codebook modes), the corresponding scale + tensor and optional codebook indexes. """ assert not config.is_integer @@ -193,20 +190,18 @@ def do_float_quantization( if scale is None: scale = calculate_float_quantization_params(weight, reduction_axes, config) norm_weight = _calculate_normalized_weight(weight, scale) - if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]: - if original_weight_backend == TensorBackend.ov: - # Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor - target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 - compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype) - else: - compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode) - elif config.is_codebook: + if config.is_codebook: compressed_weight, indexes = _calculate_codebook_quantized_weight( norm_weight, quantiles=config.get_numpy_codebook() ) return compressed_weight, scale, indexes + + if original_weight_backend == TensorBackend.ov: + # Can convert through OpenVINO and return OpenVINO-native low-precision tensor + compressed_weight = norm_weight.as_openvino_tensor().astype(config.compression_dtype) else: - compressed_weight = norm_weight + compressed_weight = _calculate_float_quantized_weight(norm_weight, config.compression_dtype) + return compressed_weight, scale, None @@ -219,7 +214,6 @@ def float_quantize_dequantize_weight( ) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values. - MXFP8_E4M3 and FP8_E4M3 modes currently are not supported. :param weight: The weight tensor to quantize-dequantize. :param config: Compression configuration. @@ -228,14 +222,6 @@ def float_quantize_dequantize_weight( :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale. :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale. """ - assert config.mode in [ - CompressWeightsMode.NF4, - CompressWeightsMode.MXFP4, - CompressWeightsMode.FP4, - CompressWeightsMode.CODEBOOK, - CompressWeightsMode.CB4_F8E4M3, - ] - # Optimized implementation if _can_run_optimized(weight, config.mode): from nncf.openvino.optimized_functions import ( @@ -523,22 +509,30 @@ def integer_quantize_dequantize_weight( return decompressed_weight -def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor: +def _calculate_float_quantized_weight(norm_weight: Tensor, compression_dtype: TensorDataType) -> Tensor: """ - Performs float (currently NF4 or MXFP4) quantization. Look-up table is used to "round" or "quantize" to the - closest quant. + Performs float quantization. Look-up table is used to "round" or "quantize" to the closest quant. :param norm_weight: Normalized weight tensor to quantize. - :return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants. + :param compression_dtype: Target floating-point data type for quantization.
+ :return: Tensor with floating-point values, where each of them corresponds to 1 out of N quants. """ - assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4] - quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else F4E2M1_QUANTILES - quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_F4E2M1_QUANTILES + assert compression_dtype in [TensorDataType.f8e4m3, TensorDataType.f4e2m1, TensorDataType.nf4] + + if compression_dtype == TensorDataType.f8e4m3: + from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3_values + + quantiles_np = fp32_to_fp8e4m3_values(norm_weight.as_numpy_tensor().data) + return fns.from_numpy(quantiles_np, backend=norm_weight.backend) + + is_nf4 = compression_dtype == TensorDataType.nf4 + quantiles_np = NF4_QUANTILES if is_nf4 else F4E2M1_QUANTILES + quantile_centers_np = CENTER_OF_NF4_QUANTILES if is_nf4 else CENTER_OF_F4E2M1_QUANTILES quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend) indexes = fns.searchsorted(quantile_centers, norm_weight) quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend) - if mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]: + if compression_dtype == TensorDataType.f4e2m1: # If in-between two quantiles, round to the nearest even quantile. shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1) dist_left = fns.abs(norm_weight - quantiles[indexes]) diff --git a/src/nncf/tensor/definitions.py b/src/nncf/tensor/definitions.py index b6b73535dc5..29c89d0b473 100644 --- a/src/nncf/tensor/definitions.py +++ b/src/nncf/tensor/definitions.py @@ -49,6 +49,7 @@ class TensorDataType(StrEnum): int8 = auto() int32 = auto() int64 = auto() + uint16 = auto() uint8 = auto() uint4 = auto() int4 = auto() @@ -81,6 +82,7 @@ def itemsize(self) -> int: TensorDataType.f8e5m2: 8, TensorDataType.int8: 8, TensorDataType.uint8: 8, + TensorDataType.uint16: 16, TensorDataType.float16: 16, TensorDataType.bfloat16: 16, TensorDataType.float32: 32, diff --git a/src/nncf/tensor/functions/openvino_numeric.py b/src/nncf/tensor/functions/openvino_numeric.py index aadf75a8ccd..8794cdb6a68 100644 --- a/src/nncf/tensor/functions/openvino_numeric.py +++ b/src/nncf/tensor/functions/openvino_numeric.py @@ -34,6 +34,7 @@ TensorDataType.int8: ov.Type.i8, TensorDataType.int32: ov.Type.i32, TensorDataType.int64: ov.Type.i64, + TensorDataType.uint16: ov.Type.u16, TensorDataType.uint8: ov.Type.u8, TensorDataType.uint4: ov.Type.u4, TensorDataType.int4: ov.Type.i4, diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py index 6a103dab25d..36682242cf3 100644 --- a/tests/openvino/optimized_functions/test_compression_functions.py +++ b/tests/openvino/optimized_functions/test_compression_functions.py @@ -74,7 +74,15 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.MXFP4, group_size=32), ] -COMPRESSION_CONFIGS = INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS +FP8_COMPRESSION_CONFIGS = [ + WeightCompressionConfig(CompressWeightsMode.FP8_E4M3), + WeightCompressionConfig(CompressWeightsMode.FP8_E4M3, group_size=2), + WeightCompressionConfig(CompressWeightsMode.MXFP8_E4M3, group_size=32), +] + +COMPRESSION_CONFIGS = ( + INT8_COMPRESSION_CONFIGS + INT4_COMPRESSION_CONFIGS + FP4_COMPRESSION_CONFIGS + FP8_COMPRESSION_CONFIGS +) WEIGHT_SHAPE = (10000, 32) @@ -367,8 +375,10 
@@ def get_input_node_data(node: ov.Node, input_id: int) -> Tensor: CompressWeightsMode.INT8_SYM, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4, + CompressWeightsMode.FP8_E4M3, + CompressWeightsMode.MXFP8_E4M3, ]: - pytest.skip("Data-aware compression is not supported for INT8, MXFP4, FP4 modes.") + pytest.skip("Data-aware compression is not supported for INT8, MXFP4, FP4, MXFP8, FP8 modes.") if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM]: if weight_dtype in [TensorDataType.f8e4m3, TensorDataType.f8e5m2]: pytest.skip("INT8 compression is not supported for f8 dtypes.") @@ -465,29 +475,24 @@ def _check_backends_and_dtypes( # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed # zero point must be in ov backend and have (u)int4/nf4/f4e2m1 dtypes in order to be able to insert them into # OV model without re-packing - if config.is_integer: - ref_dtype = TensorDataType.uint4 if config.is_asym_mode else TensorDataType.int4 - else: - ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 assert compressed_weight.backend == TensorBackend.ov - assert compressed_weight.dtype == ref_dtype + assert compressed_weight.dtype == config.compression_dtype if config.is_asym_mode and not precompute_s_zp: assert zero_point.backend == TensorBackend.ov assert zero_point.dtype == TensorDataType.uint4 else: if quantization_task != QuantizationTask.Q_DQ: # Otherwise, for integer compression, compressed weight and zero point must be returned in numpy backend, - # compressed weight must be of (u)int8, zero point -- in int32; for nf4/f4e2m1 compression, the resulting + # compressed weight must be of (u)int8, zero point -- in int32; for float compression, the resulting # data type and backend depends on the input tensor backend. 
if config.is_integer: ref_backend = TensorBackend.numpy ref_dtype = TensorDataType.uint8 if config.is_asym_mode else TensorDataType.int8 else: ref_backend = weight_tensor_backend - if weight_tensor_backend == TensorBackend.ov: - ref_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1 - else: - ref_dtype = TensorDataType.float32 + ref_dtype = ( + config.compression_dtype if weight_tensor_backend == TensorBackend.ov else TensorDataType.float32 + ) assert compressed_weight.backend == ref_backend assert compressed_weight.dtype == ref_dtype if config.is_asym_mode and not precompute_s_zp: From 948fad97053f7e331a5a5dc1fac7bb63358251b6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 20 Nov 2025 17:26:02 +0100 Subject: [PATCH 2/5] Fix tests --- .../weight_compression/fp8_conversion.py | 57 ++++++++++--------- src/nncf/tensor/functions/numpy_numeric.py | 1 + src/nncf/tensor/functions/torch_numeric.py | 1 + .../template_test_nncf_tensor.py | 1 + .../quantization/test_weights_compression.py | 36 +++++++++++- 5 files changed, 67 insertions(+), 29 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py index d78ca4df6aa..acc719b506f 100644 --- a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py +++ b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py @@ -11,28 +11,29 @@ import numpy as np - +# fmt: off F8E4M3_LUT = np.array( [ - 0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625, 0.01171875, 0.013671875, - 0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625, 0.02734375, 0.029296875, - 0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875, 0.05859375, - 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, - 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, - 0.25, 0.28125, 0.3125, 0.34375, 0.375, 0.40625, 0.4375, 0.46875, - 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, - 1.0, 1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875, - 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, - 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, - 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, - 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, - 32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0, - 64.0, 72.0, 80.0, 88.0, 96.0, 104.0, 112.0, 120.0, - 128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0, - 256.0, 288.0, 320.0, 352.0, 384.0, 416.0, 448.0, np.nan, + 0.0, 0.001953125, 0.00390625, 0.005859375, 0.0078125, 0.009765625, 0.01171875, 0.013671875, # noqa + 0.015625, 0.017578125, 0.01953125, 0.021484375, 0.0234375, 0.025390625, 0.02734375, 0.029296875, # noqa + 0.03125, 0.03515625, 0.0390625, 0.04296875, 0.046875, 0.05078125, 0.0546875, 0.05859375, # noqa + 0.0625, 0.0703125, 0.078125, 0.0859375, 0.09375, 0.1015625, 0.109375, 0.1171875, # noqa + 0.125, 0.140625, 0.15625, 0.171875, 0.1875, 0.203125, 0.21875, 0.234375, # noqa + 0.25, 0.28125, 0.3125, 0.34375, 0.375, 0.40625, 0.4375, 0.46875, # noqa + 0.5, 0.5625, 0.625, 0.6875, 0.75, 0.8125, 0.875, 0.9375, # noqa + 1.0, 1.125, 1.25, 1.375, 1.5, 1.625, 1.75, 1.875, # noqa + 2.0, 2.25, 2.5, 2.75, 3.0, 3.25, 3.5, 3.75, # noqa + 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, # noqa + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, # noqa + 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, # noqa + 32.0, 36.0, 40.0, 44.0, 48.0, 52.0, 56.0, 60.0, # noqa + 64.0, 72.0, 80.0, 88.0, 96.0, 104.0, 
112.0, 120.0, # noqa + 128.0, 144.0, 160.0, 176.0, 192.0, 208.0, 224.0, 240.0, # noqa + 256.0, 288.0, 320.0, 352.0, 384.0, 416.0, 448.0, np.nan, # noqa ], dtype=np.float32, ) +# fmt: on def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: @@ -46,7 +47,6 @@ def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: f16_m_size = 10 # f8 e4m3 layout - f8e4m3_s_mask = 0x80 f8e4m3_e_size = 4 f8e4m3_e_mask = 0x78 f8e4m3_e_bias = 7 @@ -57,9 +57,9 @@ def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: byte_shift = 8 # f8 masks in uint16 domain - f8_e_mask = f8e4m3_e_mask << byte_shift # 0x7800 - f8_m_mask = f8e4m3_m_mask << byte_shift # 0x0700 - f8_m_hidden_one_mask = 0x0800 # hidden 1 for subnormals + f8_e_mask = f8e4m3_e_mask << byte_shift # 0x7800 + f8_m_mask = f8e4m3_m_mask << byte_shift # 0x0700 + f8_m_hidden_one_mask = 0x0800 # hidden 1 for subnormals # rounding constants (same as C++) round_half = 0x01FF @@ -79,7 +79,7 @@ def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: if f16_e_field == f16_e_mask: # f16 NaN / Inf -> f8 NaN (no Inf) - f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask) + f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask elif f16_e_field != 0: # normalized f16 f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias) @@ -97,12 +97,12 @@ def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: # now set exponent & mantissa if f8_biased_exp > f8e4m3_e_max: # overflow -> NaN (no Inf) - f8_bits |= (f8e4m3_e_mask | f8e4m3_m_mask) + f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask elif f8_biased_exp > 0: # normalized f8 exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size f8_bits |= exp_field - f8_bits |= (fractional >> byte_shift) + f8_bits |= fractional >> byte_shift else: # subnormal f8 fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) @@ -113,11 +113,14 @@ def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp)) - if (((fractional & round_half) == round_odd and sticky == 0) or - (fractional & round_norm) != 0 or sticky != 0): + if ( + ((fractional & round_half) == round_odd and sticky == 0) + or (fractional & round_norm) != 0 + or sticky != 0 + ): fractional += round_even - f8_bits |= (fractional >> byte_shift) + f8_bits |= fractional >> byte_shift else: # f16 zero / subnormal -> sign + zero exponent/mantissa # (f8_bits already contains the sign) diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index 7f9c23034ea..ae62c40edee 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -37,6 +37,7 @@ TensorDataType.int32: np.dtype(np.int32), TensorDataType.int64: np.dtype(np.int64), TensorDataType.uint8: np.dtype(np.uint8), + TensorDataType.uint16: np.dtype(np.uint16), } DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index 0a094284f25..7390d36178b 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -35,6 +35,7 @@ TensorDataType.int32: torch.int32, TensorDataType.int64: torch.int64, TensorDataType.uint8: torch.uint8, + TensorDataType.uint16: torch.uint16, } DEVICE_MAP = {TensorDeviceType.CPU: "cpu", TensorDeviceType.GPU: "cuda"} diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index 14494e8efd7..2280dc8d9a9 
100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -2138,6 +2138,7 @@ def test_fn_eye(self, n, m, ref): in [ TensorDataType.int4, TensorDataType.uint4, + TensorDataType.uint16, TensorDataType.nf4, TensorDataType.f4e2m1, TensorDataType.f8e8m0, diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5a38bd3a79b..4d4a0555368 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1442,6 +1442,38 @@ def test_int_compressed_weighs_range(mode, data): 8.0, ], }, + CompressWeightsMode.FP8_E4M3: { + "neg": [ + -8.0, + -6.857143402099609, + -5.714285850524902, + -5.142857551574707, + -4.0, + -2.857142925262451, + -2.0, + -1.0, + 0.0, + ], + "pos": [0.0, 1.0, 2.0, 2.857142925262451, 4.0, 5.142857551574707, 5.714285850524902, 6.857143402099609, 8.0], + "neg-pos": [ + -8.0, + -6.857143402099609, + -5.714285850524902, + -5.142857551574707, + -4.0, + -2.857142925262451, + -2.0, + -1.0, + 0.0, + 1.0, + 2.0, + 2.857142925262451, + 4.0, + 5.142857551574707, + 5.714285850524902, + 6.857143402099609, + ], + }, } @@ -2000,7 +2032,7 @@ def test_nf4_quantization_mid_quant(weight, scale): scale = Tensor(scale) # norm_weight equals -0.8480964 (one bit away from the first NF4 quantile center) norm_weight = _calculate_normalized_weight(weight, scale) - nf4_quant = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.NF4) + nf4_quant = _calculate_float_quantized_weight(norm_weight, TensorDataType.nf4) norm_weight_ov_backend = Tensor(ov.Tensor(norm_weight.data, norm_weight.shape, ov.Type.f32)) ref_nf4_quant = norm_weight_ov_backend.astype(TensorDataType.nf4).as_numpy_tensor() @@ -2028,7 +2060,7 @@ def test_nf4_quantization_mid_quant(weight, scale): ) def test_mxfp4_quantization_edge_cases(input_val, expected_val, description): norm_weight = Tensor(np.array([input_val], dtype=np.float32)) - result = _calculate_float_quantized_weight(norm_weight, CompressWeightsMode.MXFP4) + result = _calculate_float_quantized_weight(norm_weight, TensorDataType.f4e2m1) assert result.data[0] == expected_val, ( f"{description}: Expected {expected_val}, got {result.data[0]} for input value {input_val}" From 0266a3e2d799d09b24016e617ae475f48dd88ea4 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 27 Nov 2025 16:59:16 +0100 Subject: [PATCH 3/5] Add pure numpy conversion --- .../weight_compression/fp8_conversion.py | 199 +++++++++++------- .../weight_compression/weight_lowering.py | 4 +- 2 files changed, 123 insertions(+), 80 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py index acc719b506f..1a93cf65cf8 100644 --- a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py +++ b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py @@ -36,103 +36,146 @@ # fmt: on -def _f16_to_f8e4m3_bits_scalar(h_bits: int) -> int: - """Exact port of ov::f16_to_f8e4m3_bits for a single float16 bit-pattern.""" +def f16_to_f8e4m3_bits_numpy(x: np.ndarray) -> np.ndarray: + """ + Convert an array of f16 values (or their uint16 bit patterns) to + f8e4m3 bit patterns (uint8) using a fully vectorized NumPy + port of _f16_to_f8e4m3_bits_scalar. 
+ """ # f16 layout - f16_s_mask = 0x8000 - f16_e_mask = 0x7C00 + f16_s_mask = np.uint16(0x8000) + f16_e_mask = np.uint16(0x7C00) f16_e_bias = 15 f16_e_size = 5 - f16_m_mask = 0x03FF + f16_m_mask = np.uint16(0x03FF) f16_m_size = 10 # f8 e4m3 layout f8e4m3_e_size = 4 - f8e4m3_e_mask = 0x78 + f8e4m3_e_mask = np.uint16(0x78) f8e4m3_e_bias = 7 f8e4m3_e_max = 0x0F f8e4m3_m_size = 3 - f8e4m3_m_mask = 0x07 + f8e4m3_m_mask = np.uint16(0x07) byte_shift = 8 # f8 masks in uint16 domain - f8_e_mask = f8e4m3_e_mask << byte_shift # 0x7800 - f8_m_mask = f8e4m3_m_mask << byte_shift # 0x0700 - f8_m_hidden_one_mask = 0x0800 # hidden 1 for subnormals + f8_e_mask = np.uint16(f8e4m3_e_mask << byte_shift) # 0x7800 + f8_m_mask = np.uint16(f8e4m3_m_mask << byte_shift) # 0x0700 + f8_m_hidden_one_mask = np.uint16(0x0800) # hidden 1 for subnormals - # rounding constants (same as C++) - round_half = 0x01FF - round_norm = 0x007F - round_even = 0x0080 - round_odd = 0x0180 + # rounding constants + round_half = np.uint16(0x01FF) + round_norm = np.uint16(0x007F) + round_even = np.uint16(0x0080) + round_odd = np.uint16(0x0180) # min exponent for which subnormals are representable f8_e_subnormal_min = -10 - inp = int(h_bits) & 0xFFFF - # sign bit: f16 sign -> f8 sign position (bit 15 -> bit 7) - f8_bits = (inp & f16_s_mask) >> byte_shift - - f16_e_field = inp & f16_e_mask - - if f16_e_field == f16_e_mask: - # f16 NaN / Inf -> f8 NaN (no Inf) - f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask - elif f16_e_field != 0: - # normalized f16 - f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias) - # *** IMPORTANT FIX: shift by (f16_e_size - f8e4m3_e_size) = 5 - 4 = 1 *** - fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size) - - # normalized f8 part (exp >= 0) - if f8_biased_exp >= 0: - if (fractional & round_half) == round_odd or (fractional & round_norm) != 0: - fractional += round_even - if (fractional & f8_e_mask) != 0: - f8_biased_exp += 1 - fractional &= f8_m_mask - - # now set exponent & mantissa - if f8_biased_exp > f8e4m3_e_max: - # overflow -> NaN (no Inf) - f8_bits |= f8e4m3_e_mask | f8e4m3_m_mask - elif f8_biased_exp > 0: - # normalized f8 - exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size - f8_bits |= exp_field - f8_bits |= fractional >> byte_shift - else: - # subnormal f8 - fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) - f8_exp = f8_biased_exp - f8e4m3_e_bias - shift = 1 - f8_exp - sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1) - sticky = 1 if (fractional & sticky_mask) != 0 else 0 - - fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp)) - - if ( - ((fractional & round_half) == round_odd and sticky == 0) - or (fractional & round_norm) != 0 - or sticky != 0 - ): - fractional += round_even - - f8_bits |= fractional >> byte_shift - else: - # f16 zero / subnormal -> sign + zero exponent/mantissa - # (f8_bits already contains the sign) - pass - - return f8_bits & 0xFF - - -_f16_to_f8e4m3_bits_vec = np.vectorize(_f16_to_f8e4m3_bits_scalar, otypes=[np.uint8]) - - -def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray: + f8_bits = ((x & f16_s_mask) >> byte_shift).astype(np.uint16) + + f16_e_field = x & f16_e_mask + is_naninf = f16_e_field == f16_e_mask + is_zero = f16_e_field == 0 + is_normal = (~is_naninf) & (~is_zero) + + nan_pattern = np.uint16(f8e4m3_e_mask | f8e4m3_m_mask) + + # --- Case 1: f16 NaN / Inf -> f8 NaN (no Inf) --- + f8_bits = np.where(is_naninf, 
f8_bits | nan_pattern, f8_bits) + + # --- Case 2: normalized f16 --- + # f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias) + f8_biased_exp = (f16_e_field >> f16_m_size).astype(np.int32) - (f16_e_bias - f8e4m3_e_bias) + + # fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size) + fractional_norm = ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)).astype(np.uint16) + + exp_ge0 = (f8_biased_exp >= 0) & is_normal + + # Rounding for normalized part (exp >= 0) + # if (fractional & round_half) == round_odd or (fractional & round_norm) != 0: + cond_round_norm = (((fractional_norm & round_half) == round_odd) | ((fractional_norm & round_norm) != 0)) & exp_ge0 + + # fractional += round_even where cond_round_norm + frac_tmp = fractional_norm.astype(np.uint32) + np.where(cond_round_norm, round_even, np.uint16(0)).astype(np.uint32) + fractional_norm = (frac_tmp & 0xFFFF).astype(np.uint16) + + # if (fractional & f8_e_mask) != 0: f8_biased_exp += 1 + exp_inc = np.where(exp_ge0 & ((fractional_norm & f8_e_mask) != 0), 1, 0).astype(np.int32) + f8_biased_exp_after = f8_biased_exp + exp_inc + + # fractional &= f8_m_mask + fractional_norm &= f8_m_mask + + # Overflow / normalized / subnormal classification + overflow_mask = is_normal & (f8_biased_exp_after > f8e4m3_e_max) + normal_mask = is_normal & (f8_biased_exp_after > 0) & (~overflow_mask) + # For subnormals, the scalar code uses f8_biased_exp (after possible increment), + # but increment is only applied when exp >= 0, so exp <= 0 path is unchanged. + subnormal_mask = is_normal & (f8_biased_exp_after <= 0) & (~overflow_mask) + + # --- Overflow -> NaN --- + f8_bits = np.where(overflow_mask, f8_bits | nan_pattern, f8_bits) + + # --- Normalized f8 --- + # exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size + exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype(np.uint16) + mant_norm = (fractional_norm >> byte_shift).astype(np.uint16) + + f8_bits_norm = f8_bits | exp_field | mant_norm + f8_bits = np.where(normal_mask, f8_bits_norm, f8_bits) + + # --- Subnormal f8 --- + # fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) + fractional_sub = f8_m_hidden_one_mask | ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) + + # f8_exp = f8_biased_exp - f8e4m3_e_bias + f8_exp = (f8_biased_exp_after - f8e4m3_e_bias).astype(np.int32) + + # shift = 1 - f8_exp + shift = 1 - f8_exp + + # sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1) + # we avoid invalid shifts by clipping / masking + valid_sub = f8_exp >= f8_e_subnormal_min + shift_pos = np.maximum(shift, 0) + sticky_mask32 = np.where(valid_sub, (np.uint32(1) << shift_pos) - 1, 0).astype(np.uint32) + sticky_mask16 = (sticky_mask32 & np.uint32(0xFFFF)).astype(np.uint16) + + # sticky = 1 if (fractional & sticky_mask) != 0 else 0 + sticky = ((fractional_sub & sticky_mask16) != 0) & valid_sub + + # fractional = 0 if f8_exp < f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp)) + shift2 = 1 - f8_biased_exp_after + shift2_pos = np.maximum(shift2, 0) + frac_shifted = (fractional_sub.astype(np.uint32) >> shift2_pos).astype(np.uint16) + frac_shifted = np.where(valid_sub, frac_shifted, np.uint16(0)) + + # Rounding for subnormal: + # if (((fractional & round_half) == round_odd and sticky == 0) + # or (fractional & round_norm) != 0 + # or sticky != 0): + cond_round_sub = ( + (((frac_shifted & round_half) == round_odd) & (~sticky)) | ((frac_shifted & round_norm) != 0) 
| sticky + ) & subnormal_mask + + frac_tmp_sub = frac_shifted.astype(np.uint32) + np.where(cond_round_sub, round_even, np.uint16(0)).astype(np.uint32) + fractional_sub_final = (frac_tmp_sub & 0xFFFF).astype(np.uint16) + + mant_sub = (fractional_sub_final >> byte_shift).astype(np.uint16) + f8_bits = np.where(subnormal_mask, f8_bits | mant_sub, f8_bits) + + # Case: f16 zero / subnormal -> sign + zero exponent/mantissa + # Already handled by initialization + not touching zero_mask entries. + + return (f8_bits & np.uint16(0x00FF)).astype(np.uint8) + + +def fp32_to_fp8e4m3(x: np.ndarray) -> np.ndarray: """ Bit-exact to ov::float8_e4m3(float): float32 -> float16 -> f8e4m3 bits -> float via LUT @@ -141,7 +184,7 @@ def fp32_to_fp8e4m3_values(x: np.ndarray) -> np.ndarray: x_f16 = x.astype(np.float16) h_bits = x_f16.view(np.uint16) - f8_bits = _f16_to_f8e4m3_bits_vec(h_bits) + f8_bits = f16_to_f8e4m3_bits_numpy(h_bits) # Decode exactly like C++: LUT for magnitude + sign bit idx = f8_bits & 0x7F diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 4542368d4f4..219c364555e 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -520,9 +520,9 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, compression_dtype: Te assert compression_dtype in [TensorDataType.f8e4m3, TensorDataType.f4e2m1, TensorDataType.nf4] if compression_dtype == TensorDataType.f8e4m3: - from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3_values + from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3 - quantiles_np = fp32_to_fp8e4m3_values(norm_weight.as_numpy_tensor().data) + quantiles_np = fp32_to_fp8e4m3(norm_weight.as_numpy_tensor().data) return fns.from_numpy(quantiles_np, backend=norm_weight.backend) is_nf4 = compression_dtype == TensorDataType.nf4 From d5cc3f8a7ab4defa4ea9f9d2d8d5bdbd151ec004 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 28 Nov 2025 12:53:04 +0100 Subject: [PATCH 4/5] Rewrite f32->f8 conversion using nncf.Tensor --- .../weight_compression/fp8_conversion.py | 150 +++++++++++------- .../weight_compression/openvino_backend.py | 10 +- .../weight_compression/weight_lowering.py | 3 +- src/nncf/tensor/definitions.py | 2 + src/nncf/tensor/functions/numeric.py | 11 ++ src/nncf/tensor/functions/numpy_numeric.py | 8 +- src/nncf/tensor/functions/openvino_numeric.py | 1 + src/nncf/tensor/functions/torch_numeric.py | 6 + src/nncf/tensor/tensor.py | 12 ++ .../template_test_nncf_tensor.py | 22 ++- .../quantization/test_weights_compression.py | 78 +++++++++ 11 files changed, 234 insertions(+), 69 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py index 1a93cf65cf8..2b8fc7efa79 100644 --- a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py +++ b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py @@ -9,6 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from nncf.tensor import Tensor, TensorDataType +from nncf.tensor import functions as fns import numpy as np # fmt: off @@ -36,76 +38,95 @@ # fmt: on -def f16_to_f8e4m3_bits_numpy(x: np.ndarray) -> np.ndarray: +def _f16_to_f8e4m3_bits_numpy(x: Tensor) -> Tensor: """ - Convert an array of f16 values (or their uint16 bit patterns) to - f8e4m3 bit patterns (uint8) using a fully vectorized NumPy - port of _f16_to_f8e4m3_bits_scalar. + Convert a Tensor of f16 values to f8e4m3 bit patterns (uint8). + Adopted from OpenVINO C++ implementation + https://github.com/openvinotoolkit/openvino/blame/master/src/core/src/type/float8_e4m3.cpp + + :param x: Input tensor with float16 values. + :return: Tensor with uint8 values representing f8e4m3 bit patterns. """ + + def to_u16_const(val: int) -> Tensor: + return fns.from_numpy(np.uint16(val), backend=x.backend) + + def to_u32_const(val: int) -> Tensor: + return fns.from_numpy(np.uint32(val), backend=x.backend) + + x = x.view(TensorDataType.uint16) + + u16_zero = to_u16_const(0) + u32_zero = to_u32_const(0) + # f16 layout - f16_s_mask = np.uint16(0x8000) - f16_e_mask = np.uint16(0x7C00) + f16_s_mask = to_u16_const(0x8000) + f16_e_mask = to_u16_const(0x7C00) f16_e_bias = 15 f16_e_size = 5 - f16_m_mask = np.uint16(0x03FF) + f16_m_mask = to_u16_const(0x03FF) f16_m_size = 10 # f8 e4m3 layout f8e4m3_e_size = 4 - f8e4m3_e_mask = np.uint16(0x78) + f8e4m3_e_mask = to_u16_const(0x78) f8e4m3_e_bias = 7 f8e4m3_e_max = 0x0F f8e4m3_m_size = 3 - f8e4m3_m_mask = np.uint16(0x07) + f8e4m3_m_mask = to_u16_const(0x07) byte_shift = 8 # f8 masks in uint16 domain - f8_e_mask = np.uint16(f8e4m3_e_mask << byte_shift) # 0x7800 - f8_m_mask = np.uint16(f8e4m3_m_mask << byte_shift) # 0x0700 - f8_m_hidden_one_mask = np.uint16(0x0800) # hidden 1 for subnormals + f8_e_mask = (f8e4m3_e_mask << byte_shift).astype(TensorDataType.uint16) # 0x7800 + f8_m_mask = (f8e4m3_m_mask << byte_shift).astype(TensorDataType.uint16) # 0x0700 + f8_m_hidden_one_mask = to_u16_const(0x0800) # hidden 1 for subnormals # rounding constants - round_half = np.uint16(0x01FF) - round_norm = np.uint16(0x007F) - round_even = np.uint16(0x0080) - round_odd = np.uint16(0x0180) + round_half = to_u16_const(0x01FF) + round_norm = to_u16_const(0x007F) + round_even = to_u16_const(0x0080) + round_odd = to_u16_const(0x0180) # min exponent for which subnormals are representable f8_e_subnormal_min = -10 # sign bit: f16 sign -> f8 sign position (bit 15 -> bit 7) - f8_bits = ((x & f16_s_mask) >> byte_shift).astype(np.uint16) + f8_bits = ((x & f16_s_mask) >> byte_shift).astype(TensorDataType.uint16) f16_e_field = x & f16_e_mask is_naninf = f16_e_field == f16_e_mask - is_zero = f16_e_field == 0 + is_zero = f16_e_field == u16_zero is_normal = (~is_naninf) & (~is_zero) - nan_pattern = np.uint16(f8e4m3_e_mask | f8e4m3_m_mask) + nan_pattern = (f8e4m3_e_mask | f8e4m3_m_mask).astype(TensorDataType.uint16) # --- Case 1: f16 NaN / Inf -> f8 NaN (no Inf) --- - f8_bits = np.where(is_naninf, f8_bits | nan_pattern, f8_bits) + f8_bits = fns.where(is_naninf, f8_bits | nan_pattern, f8_bits) # --- Case 2: normalized f16 --- # f8_biased_exp = (f16_e_field >> f16_m_size) - (f16_e_bias - f8e4m3_e_bias) - f8_biased_exp = (f16_e_field >> f16_m_size).astype(np.int32) - (f16_e_bias - f8e4m3_e_bias) + f8_biased_exp = (f16_e_field >> f16_m_size).astype(TensorDataType.int32) - (f16_e_bias - f8e4m3_e_bias) # fractional = (inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size) - fractional_norm = ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)).astype(np.uint16) + 
fractional_norm = ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)).astype(TensorDataType.uint16) exp_ge0 = (f8_biased_exp >= 0) & is_normal # Rounding for normalized part (exp >= 0) # if (fractional & round_half) == round_odd or (fractional & round_norm) != 0: - cond_round_norm = (((fractional_norm & round_half) == round_odd) | ((fractional_norm & round_norm) != 0)) & exp_ge0 + cond_round_norm = ( + ((fractional_norm & round_half) == round_odd) | + ((fractional_norm & round_norm) != 0) + ) & exp_ge0 # fractional += round_even where cond_round_norm - frac_tmp = fractional_norm.astype(np.uint32) + np.where(cond_round_norm, round_even, np.uint16(0)).astype(np.uint32) - fractional_norm = (frac_tmp & 0xFFFF).astype(np.uint16) + frac_tmp = fractional_norm.astype(TensorDataType.uint32) + \ + fns.where(cond_round_norm, round_even, u16_zero).astype(TensorDataType.uint32) + fractional_norm = (frac_tmp & 0xFFFF).astype(TensorDataType.uint16) # if (fractional & f8_e_mask) != 0: f8_biased_exp += 1 - exp_inc = np.where(exp_ge0 & ((fractional_norm & f8_e_mask) != 0), 1, 0).astype(np.int32) + exp_inc = fns.where(exp_ge0 & ((fractional_norm & f8_e_mask) != 0), 1, 0).astype(TensorDataType.int32) f8_biased_exp_after = f8_biased_exp + exp_inc # fractional &= f8_m_mask @@ -119,22 +140,22 @@ def f16_to_f8e4m3_bits_numpy(x: np.ndarray) -> np.ndarray: subnormal_mask = is_normal & (f8_biased_exp_after <= 0) & (~overflow_mask) # --- Overflow -> NaN --- - f8_bits = np.where(overflow_mask, f8_bits | nan_pattern, f8_bits) + f8_bits = fns.where(overflow_mask, f8_bits | nan_pattern, f8_bits) # --- Normalized f8 --- # exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size - exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype(np.uint16) - mant_norm = (fractional_norm >> byte_shift).astype(np.uint16) + exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype(TensorDataType.uint16) + mant_norm = (fractional_norm >> byte_shift).astype(TensorDataType.uint16) f8_bits_norm = f8_bits | exp_field | mant_norm - f8_bits = np.where(normal_mask, f8_bits_norm, f8_bits) + f8_bits = fns.where(normal_mask, f8_bits_norm, f8_bits) # --- Subnormal f8 --- # fractional = f8_m_hidden_one_mask | ((inp & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) fractional_sub = f8_m_hidden_one_mask | ((x & f16_m_mask) << (f16_e_size - f8e4m3_e_size)) # f8_exp = f8_biased_exp - f8e4m3_e_bias - f8_exp = (f8_biased_exp_after - f8e4m3_e_bias).astype(np.int32) + f8_exp = (f8_biased_exp_after - f8e4m3_e_bias).astype(TensorDataType.int32) # shift = 1 - f8_exp shift = 1 - f8_exp @@ -142,54 +163,67 @@ def f16_to_f8e4m3_bits_numpy(x: np.ndarray) -> np.ndarray: # sticky_mask = 0 if f8_exp < f8_e_subnormal_min else ((1 << shift) - 1) # we avoid invalid shifts by clipping / masking valid_sub = f8_exp >= f8_e_subnormal_min - shift_pos = np.maximum(shift, 0) - sticky_mask32 = np.where(valid_sub, (np.uint32(1) << shift_pos) - 1, 0).astype(np.uint32) - sticky_mask16 = (sticky_mask32 & np.uint32(0xFFFF)).astype(np.uint16) + shift_pos = fns.maximum(shift, 0) + + one_u32 = to_u32_const(1) + mask_u32_full = to_u32_const(0xFFFF) + + sticky_mask32 = fns.where( + valid_sub, + (one_u32 << shift_pos) - 1, + u32_zero, + ).astype(TensorDataType.uint32) + sticky_mask16 = (sticky_mask32 & mask_u32_full).astype(TensorDataType.uint16) # sticky = 1 if (fractional & sticky_mask) != 0 else 0 sticky = ((fractional_sub & sticky_mask16) != 0) & valid_sub # fractional = 0 if f8_exp < 
f8_e_subnormal_min else (fractional >> (1 - f8_biased_exp)) shift2 = 1 - f8_biased_exp_after - shift2_pos = np.maximum(shift2, 0) - frac_shifted = (fractional_sub.astype(np.uint32) >> shift2_pos).astype(np.uint16) - frac_shifted = np.where(valid_sub, frac_shifted, np.uint16(0)) + shift2_pos = fns.maximum(shift2, 0) + frac_shifted = (fractional_sub.astype(TensorDataType.uint32) >> shift2_pos).astype(TensorDataType.uint16) + frac_shifted = fns.where(valid_sub, frac_shifted, u16_zero) # Rounding for subnormal: # if (((fractional & round_half) == round_odd and sticky == 0) # or (fractional & round_norm) != 0 # or sticky != 0): cond_round_sub = ( - (((frac_shifted & round_half) == round_odd) & (~sticky)) | ((frac_shifted & round_norm) != 0) | sticky + (((frac_shifted & round_half) == round_odd) & (~sticky)) | + ((frac_shifted & round_norm) != 0) | + sticky ) & subnormal_mask - frac_tmp_sub = frac_shifted.astype(np.uint32) + np.where(cond_round_sub, round_even, np.uint16(0)).astype(np.uint32) - fractional_sub_final = (frac_tmp_sub & 0xFFFF).astype(np.uint16) + frac_tmp_sub = frac_shifted.astype(TensorDataType.uint32) + \ + fns.where(cond_round_sub, round_even, u16_zero).astype(TensorDataType.uint32) + fractional_sub_final = (frac_tmp_sub & 0xFFFF).astype(TensorDataType.uint16) - mant_sub = (fractional_sub_final >> byte_shift).astype(np.uint16) - f8_bits = np.where(subnormal_mask, f8_bits | mant_sub, f8_bits) + mant_sub = (fractional_sub_final >> byte_shift).astype(TensorDataType.uint16) + f8_bits = fns.where(subnormal_mask, f8_bits | mant_sub, f8_bits) # Case: f16 zero / subnormal -> sign + zero exponent/mantissa # Already handled by initialization + not touching zero_mask entries. - return (f8_bits & np.uint16(0x00FF)).astype(np.uint8) + return (f8_bits & to_u16_const(0x00FF)).astype(TensorDataType.uint8) -def fp32_to_fp8e4m3(x: np.ndarray) -> np.ndarray: - """ - Bit-exact to ov::float8_e4m3(float): - float32 -> float16 -> f8e4m3 bits -> float via LUT - """ - x = np.asarray(x, dtype=np.float32) - x_f16 = x.astype(np.float16) - h_bits = x_f16.view(np.uint16) - f8_bits = f16_to_f8e4m3_bits_numpy(h_bits) - # Decode exactly like C++: LUT for magnitude + sign bit - idx = f8_bits & 0x7F - mag = F8E4M3_LUT[idx.astype(np.int32)] +def fp32_to_fp8e4m3(x: Tensor) -> Tensor: + """ + Convert float32 to float8 e4m3 via float16. + Adopted from OpenVINO C++ implementation + https://github.com/openvinotoolkit/openvino/blame/master/src/core/src/type/float8_e4m3.cpp - sign = np.where((f8_bits & 0x80) != 0, -1.0, 1.0) - out = sign * mag - return out.astype(np.float32) + :param x: Input tensor with float32 values. + :return: Tensor with float8 e4m3 values as float32 type. 
+ """ + x_f16 = x.astype(TensorDataType.float16) + f8_bits = _f16_to_f8e4m3_bits_numpy(x_f16) + + indexes = f8_bits & 0x7F + look_up_table = fns.from_numpy(F8E4M3_LUT, backend=x.backend) + magnitude = look_up_table[indexes.astype(TensorDataType.int32)] + sign = fns.where((f8_bits & 0x80) != 0, -1.0, 1.0) + result = sign * magnitude + return result.astype(TensorDataType.float32) diff --git a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 50483e20690..af8c8034760 100644 --- a/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/src/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -225,10 +225,11 @@ def _create_compression_subgraph( precomputed_compressed_weight: Optional[CompressedWeight] = None, ): compression_dtype = DTYPE_MAP[compression_config.compression_dtype] - if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]: - scale_dtype = ov.Type.f8e8m0 - else: - scale_dtype = ov.Type.f16 + scale_dtype = ( + ov.Type.f8e8m0 + if compression_config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3] + else ov.Type.f16 + ) original_shape = weight.shape @@ -241,7 +242,6 @@ def _create_compression_subgraph( ) if compression_config.is_codebook: - compression_dtype = DTYPE_MAP[compression_config.compression_dtype] converted_const = create_ov_codebook_subgraph( codebook=compressed_weight.codebook if compression_config.mode == CompressWeightsMode.CODEBOOK diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 219c364555e..349729c5c13 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -522,8 +522,7 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, compression_dtype: Te if compression_dtype == TensorDataType.f8e4m3: from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3 - quantiles_np = fp32_to_fp8e4m3(norm_weight.as_numpy_tensor().data) - return fns.from_numpy(quantiles_np, backend=norm_weight.backend) + return fp32_to_fp8e4m3(norm_weight) is_nf4 = compression_dtype == TensorDataType.nf4 quantiles_np = NF4_QUANTILES if is_nf4 else F4E2M1_QUANTILES diff --git a/src/nncf/tensor/definitions.py b/src/nncf/tensor/definitions.py index 29c89d0b473..a3c09bda165 100644 --- a/src/nncf/tensor/definitions.py +++ b/src/nncf/tensor/definitions.py @@ -50,6 +50,7 @@ class TensorDataType(StrEnum): int32 = auto() int64 = auto() uint16 = auto() + uint32 = auto() uint8 = auto() uint4 = auto() int4 = auto() @@ -83,6 +84,7 @@ def itemsize(self) -> int: TensorDataType.int8: 8, TensorDataType.uint8: 8, TensorDataType.uint16: 16, + TensorDataType.uint32: 32, TensorDataType.float16: 16, TensorDataType.bfloat16: 16, TensorDataType.float32: 32, diff --git a/src/nncf/tensor/functions/numeric.py b/src/nncf/tensor/functions/numeric.py index 6dd5b2bf706..f67755de4d8 100644 --- a/src/nncf/tensor/functions/numeric.py +++ b/src/nncf/tensor/functions/numeric.py @@ -137,6 +137,17 @@ def astype(a: Tensor, dtype: TensorDataType) -> Tensor: """ +@tensor_dispatcher +def view(a: Tensor, dtype: TensorDataType) -> Tensor: + """ + Returns a view of the tensor with the specified data type. + + :param a: The input tensor. 
+ :param dtype: The desired data + :return: A view of the tensor with the specified data type. + """ + + @tensor_dispatcher def dtype(a: Tensor) -> TensorDataType: """ diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index ae62c40edee..0dab988e597 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -27,7 +27,7 @@ from nncf.tensor.tensor import TTensor T_NUMPY_ARRAY = NDArray[Any] -T_NUMPY = Union[T_NUMPY_ARRAY, np.generic] # type: ignore[type-arg] +T_NUMPY = Union[T_NUMPY_ARRAY, np.generic] DTYPE_MAP: dict[TensorDataType, DTypeLike] = { TensorDataType.float16: np.dtype(np.float16), @@ -38,6 +38,7 @@ TensorDataType.int64: np.dtype(np.int64), TensorDataType.uint8: np.dtype(np.uint8), TensorDataType.uint16: np.dtype(np.uint16), + TensorDataType.uint32: np.dtype(np.uint32), } DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} @@ -98,6 +99,11 @@ def _(a: T_NUMPY, dtype: TensorDataType) -> T_NUMPY: return a.astype(DTYPE_MAP[dtype]) +@numeric.view.register +def _(a: T_NUMPY, dtype: TensorDataType) -> T_NUMPY: + return a.view(DTYPE_MAP[dtype]) + + @numeric.dtype.register def _(a: T_NUMPY) -> TensorDataType: return DTYPE_MAP_REV[np.dtype(a.dtype)] diff --git a/src/nncf/tensor/functions/openvino_numeric.py b/src/nncf/tensor/functions/openvino_numeric.py index 8794cdb6a68..2fd8fc79c1a 100644 --- a/src/nncf/tensor/functions/openvino_numeric.py +++ b/src/nncf/tensor/functions/openvino_numeric.py @@ -35,6 +35,7 @@ TensorDataType.int32: ov.Type.i32, TensorDataType.int64: ov.Type.i64, TensorDataType.uint16: ov.Type.u16, + TensorDataType.uint32: ov.Type.u32, TensorDataType.uint8: ov.Type.u8, TensorDataType.uint4: ov.Type.u4, TensorDataType.int4: ov.Type.i4, diff --git a/src/nncf/tensor/functions/torch_numeric.py b/src/nncf/tensor/functions/torch_numeric.py index 7390d36178b..c32fbd49832 100644 --- a/src/nncf/tensor/functions/torch_numeric.py +++ b/src/nncf/tensor/functions/torch_numeric.py @@ -36,6 +36,7 @@ TensorDataType.int64: torch.int64, TensorDataType.uint8: torch.uint8, TensorDataType.uint16: torch.uint16, + TensorDataType.uint32: torch.uint32, } DEVICE_MAP = {TensorDeviceType.CPU: "cpu", TensorDeviceType.GPU: "cuda"} @@ -109,6 +110,11 @@ def _(a: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: return a.type(DTYPE_MAP[dtype]) +@numeric.view.register +def _(a: torch.Tensor, dtype: TensorDataType) -> torch.Tensor: + return a.view(DTYPE_MAP[dtype]) + + @numeric.dtype.register def _(a: torch.Tensor) -> TensorDataType: return DTYPE_MAP_REV[a.dtype] diff --git a/src/nncf/tensor/tensor.py b/src/nncf/tensor/tensor.py index 2a376a6cd30..7407d294725 100644 --- a/src/nncf/tensor/tensor.py +++ b/src/nncf/tensor/tensor.py @@ -159,6 +159,15 @@ def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: def __neg__(self) -> Tensor: return Tensor(-self.data) + def __invert__(self): + return Tensor(~self.data) + + def __rshift__(self, other: T_NUMBER): + return Tensor(self.data >> unwrap_tensor_data(other)) + + def __lshift__(self, other: T_NUMBER): + return Tensor(self.data << unwrap_tensor_data(other)) + # Comparison operators def __lt__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: @@ -202,6 +211,9 @@ def isempty(self) -> bool: def astype(self, dtype: TensorDataType) -> Tensor: return cast(Tensor, _call_function("astype", self, dtype)) + def view(self, dtype: TensorDataType) -> Tensor: + return cast(Tensor, _call_function("view", self, dtype)) + def reshape(self, shape: T_SHAPE) -> Tensor: 
return cast(Tensor, _call_function("reshape", self, shape)) diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index 2280dc8d9a9..026f1777459 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -48,7 +48,13 @@ } BINARY_OPERATORS = ["add", "sub", "pow", "mul", "truediv", "floordiv"] -BOOLEAN_OPERATOR_MAP = {"and": operator.and_, "or": operator.or_} +BOOLEAN_OPERATOR_MAP = { + "and": operator.and_, + "or": operator.or_, + "rshift": operator.rshift, + "lshift": operator.lshift, + "invert": lambda a, _: ~a, +} COMPARISON_OPERATOR_MAP = { "lt": operator.lt, @@ -101,9 +107,9 @@ def test_operator_clone(self): assert all(tensor_a == tensor_b) @pytest.mark.parametrize("op_name", BOOLEAN_OPERATOR_MAP.keys()) - @pytest.mark.parametrize("value", [True, False]) + @pytest.mark.parametrize("value", [0, 1, 2]) def test_operators_bool(self, op_name, value): - tensor_a = self.to_tensor([True, False]) + tensor_a = self.to_tensor([0, 1, 2]) nncf_tensor_a = Tensor(tensor_a) @@ -655,6 +661,15 @@ def test_fn_astype(self): assert isinstance(res, Tensor) assert res.dtype == TensorDataType.int8 + def test_view(self): + tensor = Tensor(self.to_tensor([1.5])) + res = tensor.view(TensorDataType.uint8) + res_back = res.view(tensor.dtype) + assert isinstance(res, Tensor) + assert res.dtype == TensorDataType.uint8 + assert res_back.dtype == tensor.dtype + assert fns.allclose(res_back, tensor) + def test_atleast_1d(self): scalar = Tensor(self.to_tensor(42)) assert fns.atleast_1d(scalar).shape == (1,) @@ -2139,6 +2154,7 @@ def test_fn_eye(self, n, m, ref): TensorDataType.int4, TensorDataType.uint4, TensorDataType.uint16, + TensorDataType.uint32, TensorDataType.nf4, TensorDataType.f4e2m1, TensorDataType.f8e8m0, diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 4d4a0555368..9566bd4e9e6 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -2067,6 +2067,84 @@ def test_mxfp4_quantization_edge_cases(input_val, expected_val, description): ) +@pytest.mark.parametrize( + "input_val,expected_val,description", + [ + # --- Zeros --- + (0.0, 0.0, "Positive zero should stay 0.0"), + (-0.0, 0.0, "Negative zero should quantize to +0.0 (LUT[0])"), + # --- Small subnormals & underflow (based on LUT[0..15]) --- + # LUT[1] = 0.001953125 + (0.0005, 0.0, "Too small magnitude should underflow to 0.0"), + (0.001, 0.001953125, "Small positive should become smallest positive subnormal (LUT[1])"), + (-0.001, -0.001953125, "Negative small should become smallest negative subnormal (-LUT[1])"), + # A few more subnormal points (LUT[2] and LUT[4]) + # LUT[2] = 0.00390625, LUT[4] = 0.0078125 + (0.003, 0.00390625, "Should round up to subnormal 0.00390625 (LUT[2])"), + (0.006, 0.005859375, "Should round to subnormal 0.005859375 (LUT[3])"), + (-0.006, -0.005859375, "Negative should round to -0.005859375 (LUT[3])"), + # --- Around the transition into 'larger' subnormals / small normals --- + # LUT[16] = 0.03125 + (0.03125, 0.03125, "0.03125 exactly representable (LUT[16])"), + (0.030, 0.029296875, "0.030 should round to 0.029296875 (LUT[15])"), + (-0.030, -0.029296875, "Negative rounding around -0.029296875"), + # --- Normal range values (taken directly from LUT for guaranteed exactness) --- + # From LUT 
around 0.0625..0.25 + (0.0625, 0.0625, "0.0625 exactly representable (LUT[24])"), + (0.0703125, 0.0703125, "0.0703125 exactly representable (LUT[25])"), + (0.078125, 0.078125, "0.078125 exactly representable (LUT[26])"), + (0.109375, 0.109375, "0.109375 exactly representable (LUT[30])"), + (0.125, 0.125, "0.125 exactly representable (LUT[32])"), + (0.25, 0.25, "0.25 exactly representable (LUT[40])"), + # A couple of midpoints to test rounding-to-nearest-even-ish behavior + (0.26, 0.25, "0.26 closer to 0.25 than 0.28125 – should round to 0.25"), + (0.28, 0.28125, "0.28 closer to 0.28125 (LUT[41]) – should round up"), + # --- Symmetry around zero for normals --- + (0.5, 0.5, "0.5 exactly representable (LUT[48])"), + (-0.5, -0.5, "-0.5 exactly representable"), + (1.0, 1.0, "1.0 exactly representable (LUT[56])"), + (-1.0, -1.0, "-1.0 exactly representable"), + (1.75, 1.75, "1.75 exactly representable (LUT[62])"), + (-1.75, -1.75, "-1.75 exactly representable"), + # --- Values in the 'integer-like' region --- + (2.0, 2.0, "2.0 exactly representable (LUT[64])"), + (3.0, 3.0, "3.0 exactly representable (LUT[68])"), + (4.0, 4.0, "4.0 exactly representable (LUT[72])"), + (5.0, 5.0, "5.0 exactly representable (LUT[74])"), + (6.0, 6.0, "6.0 exactly representable (LUT[76])"), + (7.0, 7.0, "7.0 exactly representable (LUT[78])"), + (8.0, 8.0, "8.0 exactly representable (LUT[80])"), + (-8.0, -8.0, "-8.0 exactly representable"), + # --- Larger finite values near high end of LUT --- + (16.0, 16.0, "16.0 exactly representable (LUT[88])"), + (32.0, 32.0, "32.0 exactly representable (LUT[96])"), + (64.0, 64.0, "64.0 exactly representable (LUT[104])"), + (128.0, 128.0, "128.0 exactly representable (LUT[112])"), + (256.0, 256.0, "256.0 exactly representable (LUT[120])"), + (448.0, 448.0, "448.0 exactly representable (LUT[126], max finite)"), + # --- Rounding near the max finite value --- + (400.0, 384.0, "400.0 should round to the nearest representable (LUT[116] = 384.0)"), + (460.0, 448.0, "460.0 should round to max finite 448.0 (LUT[126])"), + # --- Overflow / NaN / Inf handling --- + (500.0, np.nan, "Above max finite range, should overflow to NaN"), + (1e4, np.nan, "Way above max finite range, should overflow to NaN"), + (np.inf, np.nan, "+inf should map to NaN (no Inf representation)"), + (-np.inf, np.nan, "-inf should map to NaN (no Inf representation)"), + (np.nan, np.nan, "NaN input should remain NaN after quantization/dequantization"), + ], +) +def test_f8e4m3_quantization_edge_cases(input_val, expected_val, description): + norm_weight = Tensor(np.array([input_val], dtype=np.float32)) + result = _calculate_float_quantized_weight(norm_weight, TensorDataType.f8e4m3) + + out = result.data[0] + + if isinstance(expected_val, float) and np.isnan(expected_val): + assert np.isnan(out), f"{description}: Expected NaN, got {out} for input value {input_val}" + else: + assert out == expected_val, f"{description}: Expected {expected_val}, got {out} for input value {input_val}" + + @pytest.mark.parametrize( "codebook", [ From d572b7892ebd6b3ff5d869c481de0347f4e8ba89 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 28 Nov 2025 13:13:42 +0100 Subject: [PATCH 5/5] Style --- .../weight_compression/fp8_conversion.py | 33 +++++++++---------- .../weight_compression/weight_lowering.py | 3 +- src/nncf/tensor/functions/numpy_numeric.py | 2 +- src/nncf/tensor/tensor.py | 6 ++-- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py 
b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py index 2b8fc7efa79..a43b842f28f 100644 --- a/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py +++ b/src/nncf/quantization/algorithms/weight_compression/fp8_conversion.py @@ -9,10 +9,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nncf.tensor import Tensor, TensorDataType -from nncf.tensor import functions as fns import numpy as np +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor import functions as fns + # fmt: off F8E4M3_LUT = np.array( [ @@ -86,7 +88,7 @@ def to_u32_const(val: int) -> Tensor: round_half = to_u16_const(0x01FF) round_norm = to_u16_const(0x007F) round_even = to_u16_const(0x0080) - round_odd = to_u16_const(0x0180) + round_odd = to_u16_const(0x0180) # min exponent for which subnormals are representable f8_e_subnormal_min = -10 @@ -115,14 +117,12 @@ def to_u32_const(val: int) -> Tensor: # Rounding for normalized part (exp >= 0) # if (fractional & round_half) == round_odd or (fractional & round_norm) != 0: - cond_round_norm = ( - ((fractional_norm & round_half) == round_odd) | - ((fractional_norm & round_norm) != 0) - ) & exp_ge0 + cond_round_norm = (((fractional_norm & round_half) == round_odd) | ((fractional_norm & round_norm) != 0)) & exp_ge0 # fractional += round_even where cond_round_norm - frac_tmp = fractional_norm.astype(TensorDataType.uint32) + \ - fns.where(cond_round_norm, round_even, u16_zero).astype(TensorDataType.uint32) + frac_tmp = fractional_norm.astype(TensorDataType.uint32) + fns.where(cond_round_norm, round_even, u16_zero).astype( + TensorDataType.uint32 + ) fractional_norm = (frac_tmp & 0xFFFF).astype(TensorDataType.uint16) # if (fractional & f8_e_mask) != 0: f8_biased_exp += 1 @@ -144,7 +144,9 @@ def to_u32_const(val: int) -> Tensor: # --- Normalized f8 --- # exp_field = (f8_biased_exp & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size - exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype(TensorDataType.uint16) + exp_field = ((f8_biased_exp_after & (f8e4m3_e_mask >> f8e4m3_m_size)) << f8e4m3_m_size).astype( + TensorDataType.uint16 + ) mant_norm = (fractional_norm >> byte_shift).astype(TensorDataType.uint16) f8_bits_norm = f8_bits | exp_field | mant_norm @@ -189,13 +191,12 @@ def to_u32_const(val: int) -> Tensor: # or (fractional & round_norm) != 0 # or sticky != 0): cond_round_sub = ( - (((frac_shifted & round_half) == round_odd) & (~sticky)) | - ((frac_shifted & round_norm) != 0) | - sticky + (((frac_shifted & round_half) == round_odd) & (~sticky)) | ((frac_shifted & round_norm) != 0) | sticky ) & subnormal_mask - frac_tmp_sub = frac_shifted.astype(TensorDataType.uint32) + \ - fns.where(cond_round_sub, round_even, u16_zero).astype(TensorDataType.uint32) + frac_tmp_sub = frac_shifted.astype(TensorDataType.uint32) + fns.where(cond_round_sub, round_even, u16_zero).astype( + TensorDataType.uint32 + ) fractional_sub_final = (frac_tmp_sub & 0xFFFF).astype(TensorDataType.uint16) mant_sub = (fractional_sub_final >> byte_shift).astype(TensorDataType.uint16) @@ -207,8 +208,6 @@ def to_u32_const(val: int) -> Tensor: return (f8_bits & to_u16_const(0x00FF)).astype(TensorDataType.uint8) - - def fp32_to_fp8e4m3(x: Tensor) -> Tensor: """ Convert float32 to float8 e4m3 via float16. 
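The new tests above compare quantized values directly against entries of F8E4M3_LUT. For readers unfamiliar with the float8 e4m3 layout (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits, subnormal step 2^-9, maximum finite value 448, and a single NaN pattern instead of infinities), the following minimal reference decoder is a sketch only, not part of this patch; the name decode_f8e4m3 is illustrative.

import numpy as np

def decode_f8e4m3(code: int) -> float:
    """Decode one 8-bit e4m3 code into a Python float (reference only)."""
    sign = -1.0 if (code & 0x80) else 1.0
    exp_field = (code >> 3) & 0x0F
    mantissa = code & 0x07
    if exp_field == 0x0F and mantissa == 0x07:
        return float("nan")  # e4m3 reserves only this pattern; there is no Inf
    if exp_field == 0:
        return sign * mantissa * 2.0 ** -9  # subnormal: step 2^-9 = 0.001953125
    return sign * (1.0 + mantissa / 8.0) * 2.0 ** (exp_field - 7)  # normal, bias 7

# The 127 non-negative finite magnitudes reproduce the values referenced by the
# tests, e.g. decode_f8e4m3(1) == 0.001953125 (LUT[1]) and decode_f8e4m3(0x7E) == 448.0 (LUT[126]).
magnitudes = np.array([decode_f8e4m3(c) for c in range(0x7F)], dtype=np.float32)
assert magnitudes[126] == 448.0
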
diff --git a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 349729c5c13..abd8dea1b0d 100644 --- a/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/src/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -24,6 +24,7 @@ from nncf.quantization.algorithms.weight_compression.constants import F4E2M1_QUANTILES from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES +from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3 from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor @@ -520,8 +521,6 @@ def _calculate_float_quantized_weight(norm_weight: Tensor, compression_dtype: Te assert compression_dtype in [TensorDataType.f8e4m3, TensorDataType.f4e2m1, TensorDataType.nf4] if compression_dtype == TensorDataType.f8e4m3: - from nncf.quantization.algorithms.weight_compression.fp8_conversion import fp32_to_fp8e4m3 - return fp32_to_fp8e4m3(norm_weight) is_nf4 = compression_dtype == TensorDataType.nf4 diff --git a/src/nncf/tensor/functions/numpy_numeric.py b/src/nncf/tensor/functions/numpy_numeric.py index 0dab988e597..6e993b0f9ff 100644 --- a/src/nncf/tensor/functions/numpy_numeric.py +++ b/src/nncf/tensor/functions/numpy_numeric.py @@ -27,7 +27,7 @@ from nncf.tensor.tensor import TTensor T_NUMPY_ARRAY = NDArray[Any] -T_NUMPY = Union[T_NUMPY_ARRAY, np.generic] +T_NUMPY = Union[T_NUMPY_ARRAY, np.generic] # type: ignore[type-arg] DTYPE_MAP: dict[TensorDataType, DTypeLike] = { TensorDataType.float16: np.dtype(np.float16), diff --git a/src/nncf/tensor/tensor.py b/src/nncf/tensor/tensor.py index 7407d294725..b00bf99cf7e 100644 --- a/src/nncf/tensor/tensor.py +++ b/src/nncf/tensor/tensor.py @@ -159,13 +159,13 @@ def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor: def __neg__(self) -> Tensor: return Tensor(-self.data) - def __invert__(self): + def __invert__(self) -> Tensor: return Tensor(~self.data) - def __rshift__(self, other: T_NUMBER): + def __rshift__(self, other: T_NUMBER) -> Tensor: return Tensor(self.data >> unwrap_tensor_data(other)) - def __lshift__(self, other: T_NUMBER): + def __lshift__(self, other: T_NUMBER) -> Tensor: return Tensor(self.data << unwrap_tensor_data(other)) # Comparison operators
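As a usage note, the bit-manipulation surface this patch adds to Tensor (view, the shift operators, and the uint32 dtype) combines with the existing bitwise operators and fns.where as in the sketch below. This is not part of the patch; the values and variable names are illustrative, and only the numpy backend is assumed.

import numpy as np

from nncf.tensor import Tensor
from nncf.tensor import TensorDataType
from nncf.tensor import functions as fns

# Reinterpret float16 values as their raw bits, then split the IEEE-754 fields.
w = Tensor(np.array([1.0, -2.5, 1e-5], dtype=np.float16))
bits = w.view(TensorDataType.uint16)

sign = (bits >> 15) & 0x1        # 1 for negative values
exponent = (bits >> 10) & 0x1F   # 5-bit biased exponent
mantissa = bits & 0x03FF         # 10-bit fraction

# fns.where and astype operate on the wrapped data like their numpy counterparts.
# For the inputs above only the last value is a float16 subnormal, so this yields [0, 0, 1].
is_subnormal = fns.where((exponent == 0) & (mantissa != 0), 1, 0).astype(TensorDataType.uint8)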