Commit 1451632

Nikita Savelyev and Copilot authored
Optimized compression for FP8 modes (#3748)
### Changes

Added optimized OpenVINO weights compression for the fp8e4m3 data type.

`optimum-cli export openvino` time:

| Model | Memory Before (MiB) | Memory After (MiB) | Time Before (sec) | Time After (sec) |
|--------------|---------------------|--------------------|-------------------|------------------|
| Llama-3.2-1B | 2328.03 | 2394.14 (+2.84%) | 63.52 | 14.03 (-77.92%) |
| Phi-4-mini | 5608.48 | 5197.70 (-7.33%) | 187.34 | 28.05 (-85.03%) |
| Llama-3.1-8B | 9918.52 | 8443.87 (-14.86%) | 399.14 | 48.48 (-87.86%) |

### Reason for changes

UX improvement.

### Tests

Extended existing tests.
https://github.com/openvinotoolkit/nncf/actions/runs/19767009608

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 9065776 commit 1451632
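
For context, the new mode can be exercised from Python roughly like this (a minimal sketch, not code from this PR; it assumes an OpenVINO IR at `model.xml` and uses the public `nncf.compress_weights` API with the `FP8_E4M3` mode referenced in the diffs below):

```python
import openvino as ov

import nncf
from nncf import CompressWeightsMode

# Hypothetical input model; any OpenVINO IR with sufficiently large weights applies.
model = ov.Core().read_model("model.xml")

# FP8 (e4m3) weight compression; compatible modes are routed through the
# optimized OpenVINO-based compression functions touched by this commit.
compressed = nncf.compress_weights(model, mode=CompressWeightsMode.FP8_E4M3)

ov.save_model(compressed, "model_fp8.xml", compress_to_fp16=False)
```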

File tree

16 files changed: +558 -118 lines changed

src/nncf/openvino/optimized_functions/functions.py

Lines changed: 11 additions & 5 deletions
@@ -12,7 +12,6 @@
 from typing import Optional, Union
 
 import nncf
-from nncf import CompressWeightsMode
 from nncf.common.utils.caching import disable_results_caching
 from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
 from nncf.openvino.optimized_functions.models import OVModelParameters
@@ -23,6 +22,8 @@
 from nncf.openvino.optimized_functions.models import get_integer_quantization_model
 from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
+from nncf.quantization.algorithms.weight_compression.constants import OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
 from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
 from nncf.tensor import Tensor
 from nncf.tensor import TensorBackend
@@ -49,6 +50,8 @@ def do_integer_quantization(
     :param precomputed_zero_point: Optional precomputed zero point tensor.
     :return: A tuple containing the compressed weights, scale, and zero point tensors.
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     weight_shape = weight.shape
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
     zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
@@ -117,7 +120,7 @@ def do_float_quantization(
     :param precomputed_scale: Optional precomputed scale.
     :return: Returns quantized weight tensor and corresponding scale tensor.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
 
     weight_shape = weight.shape
     scale_shape = None if precomputed_scale is None else precomputed_scale.shape
@@ -129,8 +132,7 @@ def do_float_quantization(
     if weight.backend == TensorBackend.ov:
         # Return ov tensors in target precision to seamlessly insert them into openvino model later
         ov_model_params.return_ov_tensors = True
-        weight_dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
-        ov_model_params.output_dtypes.update({"compressed_weight": weight_dtype})
+        ov_model_params.output_dtypes.update({"compressed_weight": config.compression_dtype})
 
     model = get_float_quantization_model(
         ov_model_params,
@@ -177,6 +179,8 @@ def integer_quantize_dequantize_weight(
     :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale,
         (and zero point).
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     # When reduction axes are not provided, assuming that the weights are already reshaped
     if config.group_size != -1 and reduction_axes is not None:
         # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
@@ -235,7 +239,7 @@ def float_quantize_dequantize_weight(
     :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight and scale.
     :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight and scale.
     """
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES
 
     # When reduction axes are not provided, assuming that the weights are already reshaped
     if config.group_size != -1 and reduction_axes is not None:
@@ -290,6 +294,8 @@ def get_integer_quantization_error(
     :param reduction: Reduction mode to aggregate error values. Supported modes: "max_mean", "frobenius".
     :return: The quantity characterizing the error of integer quantization.
     """
+    assert config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES
+
     if reduction not in ["max_mean", "frobenius"]:
         exception_str = f"Unsupported aggregation mode: {reduction}."
         raise nncf.InternalError(exception_str)
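
The new asserts restrict these entry points to the modes the optimized kernels actually support. A caller deciding whether to take the optimized path would also have to respect the size threshold added in constants.py; a minimal sketch of such a guard (the helper `_can_use_optimized_path` is hypothetical, not part of this diff):

```python
from nncf.quantization.algorithms.weight_compression.constants import (
    MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION,
    OPTIMIZED_COMPRESSION_COMPATIBLE_MODES,
)


def _can_use_optimized_path(config, weight_size: int) -> bool:
    # The optimized OpenVINO functions only accept the listed modes and are
    # intended for weights large enough to amortize the model-building cost.
    return (
        config.mode in OPTIMIZED_COMPRESSION_COMPATIBLE_MODES
        and weight_size >= MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
    )
```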

src/nncf/openvino/optimized_functions/models.py

Lines changed: 11 additions & 13 deletions
@@ -31,6 +31,7 @@
 from nncf.openvino.graph.node_utils import convert_op
 from nncf.openvino.graph.node_utils import non_convertable_divide_op
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.constants import FP_MAX_VALUES
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
 from nncf.tensor.functions.openvino_numeric import DTYPE_MAP as DTYPE_MAP_OV
@@ -579,8 +580,6 @@ def _build_float_quantization_model(
     reduction_axes: Optional[ReductionAxes] = None,
     return_nodes: bool = False,
 ) -> Union[ModelCallable, ModelAsNodes]:
-    assert config.mode in [CompressWeightsMode.NF4, CompressWeightsMode.MXFP4, CompressWeightsMode.FP4]
-
     default_input_dtypes = {"scale": TensorDataType.float32}
     default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
 
@@ -605,7 +604,12 @@
     )
 
     # Validate output dtypes
-    valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4, TensorDataType.f4e2m1]
+    valid_compressed_weight_dtypes = [
+        TensorDataType.float32,
+        TensorDataType.nf4,
+        TensorDataType.f4e2m1,
+        TensorDataType.f8e4m3,
+    ]
     if compressed_weight_dtype not in valid_compressed_weight_dtypes:
         msg = (
             f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
@@ -633,23 +637,17 @@
     eps = np.finfo(np.float32).eps
     scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)
 
-    # Equals 1.0 for NF4
-    FP_MAX_VALS = {
-        CompressWeightsMode.MXFP4: 6.0,
-        CompressWeightsMode.FP4: 6.0,
-    }
-    if config.mode in FP_MAX_VALS:
-        scale = divide_op(scale, opset.constant(FP_MAX_VALS[config.mode], ov.Type.f32))
+    if config.compression_dtype != TensorDataType.nf4:
+        scale = divide_op(scale, opset.constant(FP_MAX_VALUES[config.compression_dtype], ov.Type.f32))
 
-    if config.mode == CompressWeightsMode.MXFP4:
+    if config.mode in [CompressWeightsMode.MXFP4, CompressWeightsMode.MXFP8_E4M3]:
         scale = opset.log(scale) / opset.log(opset.constant(2.0, ov.Type.f32))
         scale = opset.ceil(scale)
         scale = opset.clamp(scale, -127.0, 127.0)
         scale = opset.power(opset.constant(2.0, ov.Type.f32), scale)
 
     compressed_weight = divide_op(weight, scale)
-    target_dtype = ov.Type.nf4 if config.mode == CompressWeightsMode.NF4 else ov.Type.f4e2m1
-    compressed_weight = convert_op(compressed_weight, target_dtype)
+    compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[config.compression_dtype])
     compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])
 
     ov_results = [compressed_weight]
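
For f8e4m3, the scale handling encoded in the graph above boils down to dividing the raw scale by 448 and, for the MX variant, rounding it up to a power of two with a clamped exponent. A rough numpy analogue (using the per-axis abs-max as the raw scale is an assumption; the actual scale is produced earlier in the model builder):

```python
import numpy as np

F8E4M3_MAX = 448.0  # FP_MAX_VALUES[TensorDataType.f8e4m3] in the diff


def fp8_e4m3_scale(weight: np.ndarray, axis: int = -1, mx: bool = False) -> np.ndarray:
    # Raw per-axis scale (assumed abs-max, computed upstream in the real code).
    scale = np.max(np.abs(weight), axis=axis, keepdims=True).astype(np.float32)
    # Epsilon guard, mirroring opset.select(opset.less(opset.abs(scale), eps), eps, scale).
    scale = np.maximum(scale, np.finfo(np.float32).eps)
    # Map the scale onto the finite f8e4m3 range (division by FP_MAX_VALUES).
    scale = scale / F8E4M3_MAX
    if mx:
        # MXFP8_E4M3: power-of-two scale with the exponent clamped to [-127, 127].
        scale = 2.0 ** np.clip(np.ceil(np.log2(scale)), -127.0, 127.0)
    return scale


w = np.random.randn(8, 64).astype(np.float32)
s = fp8_e4m3_scale(w, mx=True)
q = w / s  # this tensor is then cast to f8e4m3; dequantization is q * s
```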

src/nncf/quantization/algorithms/weight_compression/config.py

Lines changed: 33 additions & 1 deletion
@@ -46,7 +46,14 @@ def num_bits(self):
         """
         :return: number of bits that is used for storing a single quantized value in the given mode.
         """
-        return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4
+        if self.mode in [
+            CompressWeightsMode.INT8_SYM,
+            CompressWeightsMode.INT8_ASYM,
+            CompressWeightsMode.FP8_E4M3,
+            CompressWeightsMode.MXFP8_E4M3,
+        ]:
+            return 8
+        return 4
 
     @property
     def is_asym_mode(self):
@@ -74,6 +81,31 @@ def is_codebook(self):
         """
         return self.mode in [CompressWeightsMode.CODEBOOK, CompressWeightsMode.CB4_F8E4M3]
 
+    @property
+    def compression_dtype(self) -> TensorDataType:
+        """
+        :return: data type that is used to store compressed weights.
+        """
+        if self.is_codebook:
+            n_quants = self.codebook_values.size
+            if n_quants <= 16:
+                return TensorDataType.uint4
+            if n_quants <= 256:
+                return TensorDataType.uint8
+            return TensorDataType.uint16
+        dtype_per_mode = {
+            CompressWeightsMode.INT4_SYM: TensorDataType.int4,
+            CompressWeightsMode.INT4_ASYM: TensorDataType.uint4,
+            CompressWeightsMode.INT8_ASYM: TensorDataType.uint8,
+            CompressWeightsMode.INT8_SYM: TensorDataType.int8,
+            CompressWeightsMode.NF4: TensorDataType.nf4,
+            CompressWeightsMode.FP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.MXFP4: TensorDataType.f4e2m1,
+            CompressWeightsMode.FP8_E4M3: TensorDataType.f8e4m3,
+            CompressWeightsMode.MXFP8_E4M3: TensorDataType.f8e4m3,
+        }
+        return dtype_per_mode[self.mode]
+
     def get_numpy_codebook(self):
         return self.codebook_values.as_numpy_tensor()
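
A quick illustration of the two properties touched here (assuming `WeightCompressionConfig` can be constructed with just a mode, since its other fields appear to have defaults):

```python
from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import TensorDataType

cfg = WeightCompressionConfig(mode=CompressWeightsMode.FP8_E4M3)
assert cfg.num_bits == 8                               # FP8 modes now report 8 bits per value
assert cfg.compression_dtype == TensorDataType.f8e4m3  # storage dtype resolved from the mode
```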

src/nncf/quantization/algorithms/weight_compression/constants.py

Lines changed: 33 additions & 0 deletions
@@ -11,6 +11,9 @@
 
 import numpy as np
 
+from nncf.parameters import CompressWeightsMode
+from nncf.tensor import TensorDataType
+
 NF4_QUANTILES = np.array(
     [
         -1.0,
@@ -101,3 +104,33 @@
 
 
 CENTER_OF_F4E2M1_QUANTILES = (F4E2M1_QUANTILES[1:] + F4E2M1_QUANTILES[:-1]) / 2
+
+
+FP_MAX_VALUES = {
+    TensorDataType.nf4: 1.0,
+    TensorDataType.f4e2m1: 6.0,
+    TensorDataType.f8e4m3: 448.0,
+}
+
+
+MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES = (
+    CompressWeightsMode.INT8_ASYM,
+    CompressWeightsMode.INT8_SYM,
+    CompressWeightsMode.INT4_ASYM,
+    CompressWeightsMode.INT4_SYM,
+)
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES = (
+    CompressWeightsMode.NF4,
+    CompressWeightsMode.MXFP4,
+    CompressWeightsMode.FP4,
+    CompressWeightsMode.FP8_E4M3,
+    CompressWeightsMode.MXFP8_E4M3,
+)
+
+OPTIMIZED_COMPRESSION_COMPATIBLE_MODES = (
+    *OPTIMIZED_COMPRESSION_COMPATIBLE_INT_MODES,
+    *OPTIMIZED_COMPRESSION_COMPATIBLE_FLOAT_MODES,
+)
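
The 448.0 entry is the largest finite f8e4m3 value (4-bit exponent, 3-bit mantissa, no infinities), just as 6.0 is the f4e2m1 maximum; a quick sanity check:

```python
# Largest finite f8e4m3 value: 2**8 * 1.75 (mantissa 1.110b), since 1.111b maps to NaN.
assert 2**8 * (1 + 6 / 8) == 448.0
# Largest f4e2m1 value: 2**2 * 1.5 (mantissa 1.1b).
assert 2**2 * 1.5 == 6.0
```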
