Changes from all commits (27 commits)
a42f1eb
WIP
nikita-savelyevv Jun 17, 2025
17a3aec
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jun 23, 2025
b2e090c
Add round to nearest logic for numpy case
nikita-savelyevv Jul 22, 2025
66c0366
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Jul 29, 2025
a345984
Tweaks
nikita-savelyevv Jul 29, 2025
6e3ba6e
Temporarily install OV nightly
nikita-savelyevv Jul 29, 2025
7555794
Update src/nncf/version.py
nikita-savelyevv Aug 21, 2025
999d54f
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Sep 3, 2025
83770e1
[OpenVINO][WC] E5M2 and E4M3 FP8 weights compression support
daniil-lyakhov Sep 22, 2025
8054217
MXFP4/MXFP8_E4M3
daniil-lyakhov Sep 30, 2025
3d944de
Expand wc docs with a table
daniil-lyakhov Oct 1, 2025
0c48792
Codebook is removed from wc docs
daniil-lyakhov Oct 1, 2025
ac2f05c
Type
daniil-lyakhov Oct 1, 2025
1e23ecf
Apply suggestions from code review
daniil-lyakhov Oct 2, 2025
33aae33
Typos/pre-commit
daniil-lyakhov Oct 2, 2025
e4d47ab
Fix adjust group size
daniil-lyakhov Oct 7, 2025
2aaec38
Revert "Fix adjust group size"
daniil-lyakhov Oct 8, 2025
ab6aa74
Fail for MX with adjust fallback mode
daniil-lyakhov Oct 8, 2025
a25b5c3
Update src/nncf/quantization/algorithms/weight_compression/weight_low…
daniil-lyakhov Oct 8, 2025
a64b30c
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
83d09fc
Merge branch 'dl/FP8' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c227bac
Merge branch 'develop' into ns/ov-f4e2m1-support
nikita-savelyevv Oct 8, 2025
c026573
Revert nightly installation
nikita-savelyevv Oct 8, 2025
a000f87
Post-merge fixes
nikita-savelyevv Oct 8, 2025
831bf25
Post-merge fixes part 2
nikita-savelyevv Oct 8, 2025
344b94b
Increase test weight channel size
nikita-savelyevv Oct 8, 2025
089631a
Address suggested changes
nikita-savelyevv Oct 13, 2025
2 changes: 2 additions & 0 deletions src/nncf/openvino/graph/nncf_graph_builder.py
@@ -44,6 +44,8 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype:
type_name = ov_type.get_type_name()
conversion_map = {
"nf4": "float",
"f4e2m1": "float",
"f8e8m0": "float",
"f8e4m3": "float",
"f8e5m2": "float",
"f16": "float",
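The two new map entries make the MX element types visible to the NNCF graph as ordinary float tensors. A minimal sketch of the intended behavior, assuming `ov.Type.f4e2m1`/`ov.Type.f8e8m0` are available (recent OpenVINO builds) and that `Dtype` is the enum from `nncf.common.graph.layer_attributes`, as referenced in the signature above:

```python
import openvino as ov

from nncf.common.graph.layer_attributes import Dtype
from nncf.openvino.graph.nncf_graph_builder import convert_to_nncf_dtype

# Both MX element types are reported as floating-point dtypes when building the NNCF graph.
assert convert_to_nncf_dtype(ov.Type.f4e2m1) == Dtype.FLOAT
assert convert_to_nncf_dtype(ov.Type.f8e8m0) == Dtype.FLOAT
```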
14 changes: 7 additions & 7 deletions src/nncf/openvino/optimized_functions/functions.py
@@ -107,17 +107,16 @@ def do_float_quantization(
precomputed_scale: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes quantization scale if not provided, and performs corresponding nf4 weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
Computes quantization scale if not provided, and performs the corresponding float weight quantization.
NF4 uses 16 levels in the [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6].

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor.
:return: Returns quantized weight tensor and corresponding scale tensor.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

weight_shape = weight.shape
scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -129,7 +128,8 @@
if weight.backend == TensorBackend.ov:
# Return ov tensors in target precision to seamlessly insert them into openvino model later
ov_model_params.return_ov_tensors = True
ov_model_params.output_dtypes.update({"compressed_weight": TensorDataType.nf4})
weight_dtype = TensorDataType.f4e2m1 if config.mode == CompressWeightsMode.MXFP4 else TensorDataType.nf4
ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})

model = get_float_quantization_model(
ov_model_params,
@@ -235,7 +235,7 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

# When reduction axes are not provided, assuming that the weights are already reshaped
if config.group_size != -1 and reduction_axes is not None:
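With the widened asserts, the optimized OpenVINO path accepts MXFP4 configs in addition to NF4. A hedged usage sketch follows; NNCF normally dispatches here internally, and the `WeightCompressionConfig(mode=..., group_size=...)` arguments plus the grouped output shapes are assumptions based on the surrounding code rather than a documented API:

```python
import numpy as np

from nncf.openvino.optimized_functions import do_float_quantization
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.randn(64, 128).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.MXFP4, group_size=32)

# Scale is computed per group of 32 input channels; the weight is quantized to E2M1 levels scaled by it.
compressed_weight, scale, _ = do_float_quantization(weight, config, reduction_axes=(1,))
print(compressed_weight.shape, scale.shape)  # e.g. (64, 4, 32) and (64, 4, 1)
```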
18 changes: 13 additions & 5 deletions src/nncf/openvino/optimized_functions/models.py
@@ -286,7 +286,8 @@ def get_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
) -> Union[ModelCallable, ModelAsNodes]:
"""
Get a model that compresses weights to float (currently only nf4) destination type using the given configuration.
Get a model that compresses weights to float (currently nf4 or mxfp4) destination type using the given
configuration.

:param ov_model_params: OV model parameters.
:param config: Compression configuration.
@@ -571,7 +572,7 @@ def _build_float_quantization_model(
reduction_axes: Optional[ReductionAxes] = None,
return_nodes: bool = False,
) -> Union[ModelCallable, ModelAsNodes]:
assert config.mode == CompressWeightsMode.NF4
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]

default_input_dtypes = {"scale": TensorDataType.float32}
default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
@@ -597,8 +598,7 @@
)

# Validate output dtypes
# TODO: add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4]
valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
if compressed_weight_dtype not in valid_compressed_weight_dtypes:
msg = (
f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -626,8 +626,16 @@
eps = np.finfo(np.float32).eps
scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)

if config.mode == CompressWeightsMode.MXFP4:
scale = scale / opset.constant(6.0, ov.Type.f32)
scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
scale = opset.ceil(scale)
scale = opset.clamp(scale, -127.0, 127.0)
scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)

compressed_weight = divide_op(weight, scale)
compressed_weight = convert_op(compressed_weight, ov.Type.nf4)
target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
compressed_weight = convert_op(compressed_weight, target_dtype)
compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])

ov_results = [compressed_weight]
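For MXFP4 the per-group max-abs scale is turned into an E8M0 shared scale: divide by 6 (the largest E2M1 magnitude), round the result up to a power of two, and clamp the exponent to [-127, 127]. A numpy sketch of the same arithmetic as the `opset` graph above, for illustration only:

```python
import numpy as np

def mxfp4_e8m0_scale(max_abs: np.ndarray) -> np.ndarray:
    """Power-of-two scale such that weight / scale falls into the E2M1 range [-6, 6]."""
    eps = np.finfo(np.float32).eps
    scale = np.where(np.abs(max_abs) < eps, eps, max_abs).astype(np.float32)
    scale = scale / 6.0                           # 6.0 is the largest representable E2M1 magnitude
    exponent = np.ceil(np.log2(scale))            # round the scale up to the next power of two
    exponent = np.clip(exponent, -127.0, 127.0)   # E8M0 exponent range
    return (2.0 ** exponent).astype(np.float32)

group_max = np.array([0.75, 12.0, 100.0], dtype=np.float32)
print(mxfp4_e8m0_scale(group_max))  # e.g. [ 0.125  2.    32.  ]
```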
src/nncf/quantization/algorithms/weight_compression/constants.py
@@ -33,6 +33,27 @@
dtype=np.float32,
)

MXFP4_QUANTILES = np.array(
[
-6.0,
-4.0,
-3.0,
-2.0,
-1.5,
-1.0,
-0.5,
-0.0,
0.5,
1.0,
1.5,
2.0,
3.0,
4.0,
6.0,
],
dtype=np.float32,
)


CB4_QUANTILES = np.array(
[
@@ -77,3 +98,6 @@
],
dtype=np.float32,
)


CENTER_OF_MXFP4_QUANTILES = (MXFP4_QUANTILES[1:] + MXFP4_QUANTILES[:-1]) / 2
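`MXFP4_QUANTILES` above lists the 15 distinct values representable in FP4 E2M1 (a single zero, the ±0.5 subnormals, and normals ±1…±6), and `CENTER_OF_MXFP4_QUANTILES` holds the 14 midpoints used for nearest-value lookup. A short sanity-check sketch regenerating the set from the E2M1 definition (1 sign bit, 2 exponent bits with bias 1, 1 mantissa bit):

```python
import numpy as np

from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES

# Magnitudes: zero, the subnormal 0.5, and normals {1.0, 1.5} * 2**e for e in {0, 1, 2}.
magnitudes = sorted({0.0, 0.5} | {m * 2.0**e for e in range(3) for m in (1.0, 1.5)})
e2m1 = np.array([-m for m in reversed(magnitudes)] + magnitudes[1:], dtype=np.float32)

assert np.array_equal(e2m1, MXFP4_QUANTILES)                                  # 15 distinct E2M1 values
assert np.array_equal((e2m1[1:] + e2m1[:-1]) / 2, CENTER_OF_MXFP4_QUANTILES)  # 14 LUT midpoints
```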
src/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -19,7 +19,9 @@
from nncf.errors import UnsupportedModelError
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import CENTER_OF_NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import MXFP4_QUANTILES
from nncf.quantization.algorithms.weight_compression.constants import NF4_QUANTILES
from nncf.quantization.algorithms.weight_compression.parameters import CompressedWeight
from nncf.quantization.fake_quantize import calculate_scale_zero_point
@@ -147,15 +149,15 @@ def do_float_quantization(
"""
Computes quantization scale if not provided,
and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
NF4 uses 16 levels in the [-1, 1] range, while MXFP4 uses 16 levels in [-6, 6].
For MXFP8_E4M3 and CODEBOOK, currently returns the normalized weight without quantization.

:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
:return: Returns quantized (for MXFP8_E4M3 and codebook normalized) weight tensor and corresponding scale tensor and
optional indexes for codebook.
"""
assert not config.is_integer
@@ -165,7 +167,7 @@ def do_float_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov

return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -180,19 +182,19 @@ def do_float_quantization(
if scale is None:
scale = calculate_float_quantization_params(weight, reduction_axes, config)
norm_weight = _calculate_normalized_weight(weight, scale)
if config.mode == CompressWeightsMode.NF4:
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]:
if original_weight_backend == TensorBackend.ov:
# Can convert through OpenVINO and return OpenVINO-native NF4 tensor
compressed_weight = norm_weight.as_openvino_tensor().astype(TensorDataType.nf4)
# Can convert through OpenVINO and return OpenVINO-native nf4/f4e2m1 tensor
target_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
compressed_weight = norm_weight.as_openvino_tensor().astype(target_dtype)
else:
compressed_weight = _calculate_nf4_quantized_weight(norm_weight)
compressed_weight = _calculate_float_quantized_weight(norm_weight, config.mode)
elif config.is_codebook:
compressed_weight, indexes = _calculate_codebook_quantized_weight(
norm_weight, quantiles=config.get_numpy_codebook()
)
return compressed_weight, scale, indexes
else:
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
compressed_weight = norm_weight
return compressed_weight, scale, None

@@ -205,8 +207,8 @@ def float_quantize_dequantize_weight(
return_compressed_weight: Optional[bool] = False,
) -> Union[Tensor, tuple[Tensor, Tensor, Tensor]]:
"""
First quantizes the given weight tensor to float (nf4) dtype and then dequantizes it back to obtain float32 values.
MXFP4 and MXFP8_E4M3 mode is currently not supported.
First quantizes the given weight tensor to float dtype and then dequantizes it back to obtain float32 values.
MXFP8_E4M3 mode is currently not supported.

:param weight: The weight tensor to quantize-dequantize.
:param config: Compression configuration.
@@ -215,11 +217,15 @@ def float_quantize_dequantize_weight(
:param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
:return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
"""
assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3, once ticket 164851 is resolved
assert config.mode in [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]

# Optimized implementation
if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
if config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4] and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
)
@@ -508,17 +514,30 @@ def integer_quantize_dequantize_weight(
return decompressed_weight


def _calculate_nf4_quantized_weight(norm_weight: Tensor) -> Tensor:
def _calculate_float_quantized_weight(norm_weight: Tensor, mode: CompressWeightsMode) -> Tensor:
"""
Performs NF4 quantization. Look-up table is used to "round" or "quantize" to the closest quant.
Performs float (currently NF4 or MXFP4) quantization. A look-up table is used to "round" or "quantize" each
value to the closest quant.

:param norm_weight: Weight tensor to quantize already normalized to [-1, 1] range.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants on [-1, 1].
:param norm_weight: Normalized weight tensor to quantize.
:param mode: Compression mode defining the look-up table to use, NF4 or MXFP4.
:return: Tensor with floating-point values, where each of them corresponds to 1 out of 16 quants.
"""
center_nf4_quantiles = fns.from_numpy(CENTER_OF_NF4_QUANTILES, backend=norm_weight.backend)
indexes = fns.searchsorted(center_nf4_quantiles, norm_weight)
nf4_quantiles = fns.from_numpy(NF4_QUANTILES, backend=indexes.backend)
quantized_weight = nf4_quantiles[indexes]
assert mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4]
quantiles_np = NF4_QUANTILES if mode == CompressWeightsMode.NF4 else MXFP4_QUANTILES
quantile_centers_np = CENTER_OF_NF4_QUANTILES if mode == CompressWeightsMode.NF4 else CENTER_OF_MXFP4_QUANTILES
quantile_centers = fns.from_numpy(quantile_centers_np, backend=norm_weight.backend)
indexes = fns.searchsorted(quantile_centers, norm_weight)
quantiles = fns.from_numpy(quantiles_np, backend=indexes.backend)

if mode == CompressWeightsMode.MXFP4:
# If exactly between two quantiles, round half to even: the tie goes to the quantile with an even mantissa.
shifted_indexes = fns.clip(indexes + 1, 0, quantiles.size - 1)
dist_left = fns.abs(norm_weight - quantiles[indexes])
dist_right = fns.abs(norm_weight - quantiles[shifted_indexes])
choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((shifted_indexes + 1) % 2 == 0))
indexes = fns.where(choose_right, shifted_indexes, indexes)

quantized_weight = quantiles[indexes]
return quantized_weight


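End to end, the numpy MXFP4 path normalizes each group by its scale and then snaps the result to the nearest E2M1 value via the midpoint look-up, breaking exact ties toward the value with an even mantissa. A self-contained numpy sketch of that lookup, mirroring the `searchsorted` logic above (an illustration, not the NNCF implementation):

```python
import numpy as np

MXFP4 = np.array([-6, -4, -3, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=np.float32)
CENTERS = (MXFP4[1:] + MXFP4[:-1]) / 2

def quantize_to_mxfp4(norm_weight: np.ndarray) -> np.ndarray:
    """Snap already-normalized values (expected within [-6, 6]) to the nearest E2M1 quantile."""
    idx = np.searchsorted(CENTERS, norm_weight)
    right = np.clip(idx + 1, 0, MXFP4.size - 1)
    dist_left = np.abs(norm_weight - MXFP4[idx])
    dist_right = np.abs(norm_weight - MXFP4[right])
    # On an exact tie, (right + 1) % 2 == 0 selects the odd array index, i.e. the E2M1 value
    # with an even mantissa bit -- round-half-to-even, as in the code above.
    choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & ((right + 1) % 2 == 0))
    return MXFP4[np.where(choose_right, right, idx)]

x = np.array([-5.1, -0.3, 0.74, 2.5, 5.0], dtype=np.float32)
print(quantize_to_mxfp4(x))  # e.g. [-6.  -0.5  0.5  2.   4. ]
```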
2 changes: 2 additions & 0 deletions src/nncf/tensor/definitions.py
@@ -44,6 +44,8 @@ class TensorDataType(StrEnum):
float64 = auto()
f8e4m3 = auto()
f8e5m2 = auto()
f8e8m0 = auto()
f4e2m1 = auto()
nf4 = auto()
int8 = auto()
int32 = auto()
6 changes: 5 additions & 1 deletion src/nncf/tensor/functions/openvino_numeric.py
@@ -23,6 +23,8 @@

DTYPE_MAP: dict[TensorDataType, ov.Type] = {
TensorDataType.nf4: ov.Type.nf4,
TensorDataType.f4e2m1: ov.Type.f4e2m1,
TensorDataType.f8e8m0: ov.Type.f8e8m0,
TensorDataType.f8e4m3: ov.Type.f8e4m3,
TensorDataType.f8e5m2: ov.Type.f8e5m2,
TensorDataType.float16: ov.Type.f16,
@@ -42,6 +44,8 @@
TensorDataType.int4,
TensorDataType.uint4,
TensorDataType.nf4,
TensorDataType.f4e2m1,
TensorDataType.f8e8m0,
TensorDataType.f8e4m3,
TensorDataType.f8e5m2,
]
@@ -95,7 +99,7 @@ def _(a: ov.Tensor, shape: Union[int, tuple[int, ...]]) -> ov.Tensor:

@numeric.as_numpy_tensor.register
def _(a: ov.Tensor) -> NDArray[Any]:
# Cannot convert bfloat16, uint4, int4, nf4, f8e4m3, f8e5m2 to numpy directly
# Cannot convert bfloat16, uint4, int4, nf4, f4e2m1, f8e8m0, f8e4m3, f8e5m2 to numpy directly
a_dtype = DTYPE_MAP_REV[a.get_element_type()]
if a_dtype in NATIVE_OV_CAST_DTYPES:
dtype = TensorDataType.float32
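Since numpy has no native 4-bit or MX float dtypes, `as_numpy_tensor` decodes these OpenVINO tensors through an intermediate float32 cast. A hedged round-trip sketch using NNCF's Tensor wrapper; the method names follow their usage elsewhere in this diff, and the decoded values depend on OpenVINO's own rounding, so treat it as indicative only:

```python
import numpy as np

from nncf.tensor import Tensor
from nncf.tensor.definitions import TensorDataType

weight = Tensor(np.array([0.4, 1.3, 5.7], dtype=np.float32))

# numpy -> OV-native f4e2m1 tensor -> numpy again (decoded through a float32 cast, as noted above).
ov_f4 = weight.as_openvino_tensor().astype(TensorDataType.f4e2m1)
decoded = ov_f4.as_numpy_tensor()
print(decoded.data)  # e.g. [0.5 1.5 6. ]
```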
9 changes: 9 additions & 0 deletions src/nncf/tensor/tensor.py
@@ -84,6 +84,12 @@ def __len__(self) -> int:

# built-in operations

def __or__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data | unwrap_tensor_data(other))

def __and__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data & unwrap_tensor_data(other))

def __add__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data + unwrap_tensor_data(other))

@@ -144,6 +150,9 @@ def __ifloordiv__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
self._data //= unwrap_tensor_data(other)
return self

def __mod__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return cast(Tensor, _call_function("_binary_op_nowarn", self, other, operator.mod))

def __matmul__(self, other: Union[Tensor, T_NUMBER]) -> Tensor:
return Tensor(self.data @ unwrap_tensor_data(other))

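The new `__or__` and `__and__` operators let boolean masks produced by Tensor comparisons be combined elementwise, which is what the MXFP4 tie-break above relies on. A minimal numpy-backed sketch:

```python
import numpy as np

from nncf.tensor import Tensor

dist_left = Tensor(np.array([0.1, 0.5, 0.5], dtype=np.float32))
dist_right = Tensor(np.array([0.3, 0.5, 0.2], dtype=np.float32))
right_index_is_odd = Tensor(np.array([True, True, False]))

# Elementwise combination of boolean masks, mirroring the MXFP4 rounding tie-break.
choose_right = (dist_right < dist_left) | ((dist_left == dist_right) & right_index_is_odd)
print(choose_right.data)  # [False  True  True]
```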