1 change: 1 addition & 0 deletions docs/Algorithms.md
@@ -13,6 +13,7 @@
- NF4 compression mode
- Arbitrary look-up table (CODEBOOK) or predefined lookup table based on NF4 (CB4_F8E4M3)
- MX-compliant types - MXFP4 and MXFP8_E4M3
- FP8 type - FP8_E4M3
- Mixed precision weights compression
- Grouped weights compression

@@ -47,6 +47,7 @@ NNCF can automatically distribute precision assignments based on quantization se
| CB4_F8E4M3 | E4M3 | FP16 | Per-channel / Group-wise | A fixed lookup table with 16 E4M3 values based on NF4 values |
| MXFP4 | E2M1 | E8M0 | Group-wise (32) | [MX-compliant FP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| MXFP8_E4M3 | E4M3 | E8M0 | Group-wise (32) | [MX-compliant FP8](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) |
| FP8_E4M3 | E4M3 | FP16 | Per-channel / Group-wise | [FP8](https://arxiv.org/pdf/2209.05433) |

**Note**: Granularity refers to the scope of elements sharing quantization parameters. "Per-channel" applies different parameters for each output channel, while "Group-wise" divides weights into groups (e.g., group_size=128) that share the same parameters.

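As an illustration of the new table row, here is a minimal NumPy sketch (not the NNCF implementation) of how a group-wise FP16 scale for FP8_E4M3 weights could be derived, assuming a 2D weight tensor and the E4M3 maximum magnitude of 448; function and variable names are illustrative only.

```python
# Minimal sketch, not the NNCF implementation: derive group-wise FP16 scales
# for FP8_E4M3 by fitting each group's max magnitude into the E4M3 range.
import numpy as np

E4M3_MAX = 448.0  # largest finite magnitude representable in E4M3

def fp8_e4m3_group_scales(weight: np.ndarray, group_size: int = 128) -> np.ndarray:
    out_ch, in_ch = weight.shape
    assert in_ch % group_size == 0, "input channels must be divisible by group_size"
    grouped = weight.reshape(out_ch, in_ch // group_size, group_size)
    # One scale per group of `group_size` weights along the input-channel axis.
    scales = np.abs(grouped).max(axis=-1, keepdims=True) / E4M3_MAX
    return scales.astype(np.float16)

w = np.random.randn(8, 256).astype(np.float32)
print(fp8_e4m3_group_scales(w).shape)  # -> (8, 2, 1)
```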
2 changes: 2 additions & 0 deletions src/nncf/parameters.py
@@ -92,6 +92,7 @@ class CompressWeightsMode(StrEnum):
:param INT8: Mode is deprecated and will be removed in future releases. Please use `INT8_ASYM` instead.
:param MXFP4: MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
:param MXFP8_E4M3: MX-compliant FP8 format with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
:param FP8_E4M3: An FP8 format with E4M3 values sharing a group-level FP16 scale.
:param CODEBOOK: Codebook (LUT) quantization format.
:param CB4_F8E4M3: Codebook (LUT) format with 16 fixed fp8 values in E4M3 format.
"""
@@ -105,6 +106,7 @@
INT8 = "int8" # Deprecated mode
MXFP4 = "mxfp4"
MXFP8_E4M3 = "mxfp8_e4m3"
FP8_E4M3 = "fp8_e4m3"
CODEBOOK = "codebook"


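For context, a hedged usage sketch of selecting the new enum member through the public API; `ov_model` is assumed to be an OpenVINO model with weighted layers (it is not defined here), and the keyword values are illustrative.

```python
# Illustrative usage of the new enum member; `ov_model` is assumed to be an
# openvino.Model containing weighted MatMul/Embedding layers.
import nncf
from nncf import CompressWeightsMode

compressed = nncf.compress_weights(
    ov_model,
    mode=CompressWeightsMode.FP8_E4M3,  # E4M3 weights with an FP16 scale
    group_size=128,                     # weights per group sharing one scale
)
```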
@@ -65,6 +65,7 @@
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]
SUPPORTED_DATA_TYPES = [
TensorDataType.float16,
@@ -300,6 +301,7 @@ def __init__(
NF4 is the same as INT4_SYM mode, but primary precision is NF4 data type without zero point.
MXFP4 is MX-compliant FP4 with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
MXFP8_E4M3 is MX-compliant FP8 with E4M3 values sharing group-level E8M0 scale. The size of group is 32.
FP8_E4M3 is FP8 with E4M3 values sharing group-level FP16 scale.
:param ratio: the ratio between primary and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to backup_mode).
:param group_size: number of weights (e.g. 128) in the channel dimension
@@ -61,6 +61,7 @@ def is_integer(self):
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]
@@ -231,6 +231,8 @@ def _create_compression_subgraph(
elif compression_config.mode == CompressWeightsMode.MXFP8_E4M3:
compression_dtype = ov.Type.f8e4m3
scale_dtype = ov.Type.f8e8m0
elif compression_config.mode == CompressWeightsMode.FP8_E4M3:
compression_dtype = ov.Type.f8e4m3
elif compression_config.mode == CompressWeightsMode.INT4_SYM:
compression_dtype = ov.Type.i4
elif compression_config.mode == CompressWeightsMode.INT4_ASYM:
@@ -440,6 +440,7 @@ def transform_model(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]:
msg = f"{compression_config.mode.value} is not supported."
raise nncf.ParameterNotSupportedError(msg)
@@ -180,6 +180,7 @@ def transform_model(
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]:
msg = f"{compression_config.mode.value} is not supported."
raise nncf.ParameterNotSupportedError(msg)
@@ -81,7 +81,7 @@ def calculate_float_quantization_params(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig
) -> Tensor:
"""
Calculates the scale for nf4 or mxfp4/mxfp8_e4m3 quantization.
Calculates the scale for nf4 or mxfp4/mxfp8_e4m3/fp8_e4m3 quantization.
:param weight: Weight array to compress.
:param reduction_axes: Axes along which to reduce (collect) different statistics (e.g., min, max).
@@ -97,6 +97,7 @@
FP_MAX_VALS = {
CompressWeightsMode.MXFP4: 6.0,
CompressWeightsMode.MXFP8_E4M3: 448.0,
CompressWeightsMode.FP8_E4M3: 448.0,
}
if config.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3] + list(FP_MAX_VALS.keys()):
if config.mode in FP_MAX_VALS:
@@ -146,17 +147,17 @@ def do_float_quantization(
) -> tuple[Tensor, Tensor, Tensor]:
"""
Computes quantization scale if not provided,
and performs corresponding (nf4, MXFP4 and MXFP8_E4M3) weight quantization.
and performs corresponding (nf4, MXFP4, MXFP8_E4M3, FP8_E4M3) weight quantization.
For NF4 quantization quantizes the weights to 16 levels on [-1, 1] interval.
For MXFP4, MXFP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
For MXFP4, MXFP8_E4M3, FP8_E4M3 and CODEBOOK currently returns normalized weight without quantization.
TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3 and FP8_E4M3 once ticket 164851 is resolved
:param weight: Weight array to compress.
:param config: Weight compression configuration.
:param reduction_axes: Axes, along which to reduce (collect) different statistics.
:param precomputed_scale: Optional precomputed scale.
:return: Returns quantized (for MXFP4 and MXFP8_E4M3 normalized) weight tensor and corresponding scale tensor and
optional indexes for codebook.
:return: Returns quantized (for MXFP4, MXFP8_E4M3 and FP8_E4M3 normalized) weight tensor and
corresponding scale tensor and optional indexes for codebook.
"""
assert not config.is_integer

@@ -192,7 +193,7 @@ def do_float_quantization(
)
return compressed_weight, scale, indexes
else:
# TODO(nikita-savelyevv): add support for MXFP4 and MXFP8_E4M3 once ticket 164851 is resolved
# TODO(nikita-savelyevv): add support for MXFP4, MXFP8_E4M3, FP8_E4M3 once ticket 164851 is resolved
compressed_weight = norm_weight
return compressed_weight, scale, None

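A simplified NumPy sketch of the behavior the docstrings above describe for FP8_E4M3: the scale is the per-group max magnitude divided by 448, and the normalized weight is returned without casting, per the TODO referencing ticket 164851. This is an approximation, not the NNCF code.

```python
# Simplified sketch, not the NNCF code: FP8_E4M3 float-quantization path.
import numpy as np

E4M3_MAX = 448.0  # FP_MAX_VALS value for FP8_E4M3 above

def float_quantize_fp8_e4m3(weight: np.ndarray, reduction_axes=(-1,)):
    # Scale so that each group's max magnitude maps to the E4M3 maximum.
    scale = np.abs(weight).max(axis=reduction_axes, keepdims=True) / E4M3_MAX
    scale = np.where(scale == 0.0, 1.0, scale)  # illustrative guard for all-zero groups
    # The weight is only normalized here; casting to f8e4m3 is left to the backend.
    return weight / scale, scale.astype(np.float16)

w = np.random.randn(4, 2, 32).astype(np.float32)  # (out_ch, n_groups, group_size)
norm_w, scale = float_quantize_fp8_e4m3(w)
print(norm_w.shape, scale.shape)  # (4, 2, 32) (4, 2, 1)
```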
51 changes: 34 additions & 17 deletions src/nncf/quantization/quantize_model.py
@@ -460,6 +460,7 @@ def compress_weights(
MXFP4 is MX-compliant FP4 format with E2M1 values sharing group-level E8M0 scale. The size of group is 32.
MXFP8_E4M3 - is MX-compliant FP8 format with E4M3 values sharing a group-level E8M0 scale.
The size of group is 32.
FP8_E4M3 - is an FP8 format with E4M3 values sharing a group-level FP16 scale.
:type mode: nncf.CompressWeightsMode
:param ratio: the ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to NF4
and the rest to INT8_ASYM).
@@ -517,14 +518,18 @@ def compress_weights(
from nncf.torch.nncf_network import NNCFNetwork
from nncf.torch.quantization.quantize_model import compress_weights_impl as pt_compression_weights_impl

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {"gptq": gptq, "lora_correction": lora_correction}
@@ -567,14 +572,18 @@ def compress_weights(
compress_weights_impl as fx_compression_weights_impl,
)

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "Torch backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"Torch backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {
@@ -610,14 +619,18 @@ def compress_weights(
msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None."
raise nncf.ParameterNotSupportedError(msg)

if any((awq, scale_estimation, gptq, lora_correction)) and mode in [
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
]:
msg = (
"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode in [MXFP4, MXFP8_E4M3]."
)
raise nncf.ParameterNotSupportedError(msg)
if any((awq, scale_estimation, gptq, lora_correction)):
not_supported_modes = [
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
]
if mode in not_supported_modes:
msg = (
"AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined,"
f" but mode in {[m.value for m in not_supported_modes]}."
)
raise nncf.ParameterNotSupportedError(msg)

if gptq and lora_correction:
msg = "Simultaneous use of Lora correction and GPTQ algorithms is not supported. Select one of them."
@@ -632,14 +645,18 @@ def compress_weights(
elif backend == BackendType.ONNX:
from nncf.onnx.quantization.quantize_model import compress_weights_impl as onnx_compress_weights_impl

if mode in [
not_supported_modes = [
CompressWeightsMode.NF4,
CompressWeightsMode.MXFP4,
CompressWeightsMode.MXFP8_E4M3,
CompressWeightsMode.FP8_E4M3,
CompressWeightsMode.CODEBOOK,
CompressWeightsMode.CB4_F8E4M3,
]:
msg = "ONNX backend does not support NF4, MXFP4, MXFP8_E4M3 and CODEBOOK modes for weight compression."
]
if mode in not_supported_modes:
msg = (
f"ONNX backend does not support {[m.value for m in not_supported_modes]} modes for weight compression."
)
raise nncf.ParameterNotSupportedError(msg)

options = {
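A hedged illustration of the extended guard above: combining a data-aware algorithm such as AWQ with the FP8_E4M3 mode is rejected before compression starts. `ov_model` and `calibration_dataset` are assumed to exist and are not defined here.

```python
# Illustrative only: the new FP8_E4M3 entry makes this combination raise.
import nncf

try:
    nncf.compress_weights(
        ov_model,
        mode=nncf.CompressWeightsMode.FP8_E4M3,
        awq=True,
        dataset=calibration_dataset,
    )
except nncf.ParameterNotSupportedError as err:
    print(err)  # "... but mode in ['mxfp4', 'mxfp8_e4m3', 'fp8_e4m3']."
```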
26 changes: 26 additions & 0 deletions tests/openvino/conftest.py
@@ -9,20 +9,46 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from pathlib import Path

import pytest
from pytest import Config

from nncf import set_log_level
from tests.cross_fw.shared.case_collection import COMMON_SCOPE_MARKS_VS_OPTIONS
from tests.cross_fw.shared.case_collection import skip_marked_cases_if_options_not_specified
from tests.cross_fw.shared.install_fixtures import tmp_venv_with_nncf # noqa: F401
from tests.cross_fw.shared.paths import TEST_ROOT


def pytest_addoption(parser):
parser.addoption(
"--regen-ref-data",
action="store_true",
default=False,
help="If specified, the reference files will be regenerated using the current state of the repository.",
)
parser.addoption(
"--nncf-debug",
action="store_true",
default=False,
help="Set debug level for nncf logger.",
)
parser.addoption("--data", type=str, default=None, help="Directory path to cached data.")


def pytest_configure(config: Config) -> None:
regen_dot = config.getoption("--regen-ref-data", False)
if regen_dot:
os.environ["NNCF_TEST_REGEN_DOT"] = "1"

nncf_debug = config.getoption("--nncf-debug", False)
if nncf_debug:
set_log_level(logging.DEBUG)


@pytest.fixture(name="data_dir")
def data(request):
option = request.config.getoption("--data")
@@ -1,5 +1,33 @@
{
"matmul_2_data": {
"compressed_weight": [
55,
249,
21,
56,
162,
197,
244,
38,
251,
248,
185,
255,
207,
223,
234,
216,
253,
178,
254,
208,
255
],
"zero_point": [
0,
0,
0
],
"scale": [
[
[
@@ -1,5 +1,28 @@
{
"matmul_2_data": {
"compressed_weight": [
-19,
-117,
-2,
-20,
-65,
-83,
-114,
-3,
-117,
-116,
-36,
-117,
-56,
-54,
-101,
-84,
-118,
-81,
-120,
-112,
-120
],
"scale": [
[
[
@@ -1,5 +1,28 @@
{
"matmul_2_data": {
"compressed_weight": [
156,
253,
138,
173,
216,
235,
250,
139,
254,
252,
205,
253,
207,
206,
237,
236,
254,
233,
255,
247,
255
],
"scale": [
[
[
@@ -1,5 +1,23 @@
{
"matmul_2_data": {
"compressed_weight": [
198,
194,
191,
187,
184,
178,
170,
0,
41,
49,
54,
57,
60,
64,
66,
70
],
"scale": [
[
[