
Commit a26eb13

nikita-savelyevv authored and mvafin committed
Add support for cb4_f8e4m3 quantization mode. (#1378)
* Add support for cb4_f8e4m3 compression format.
* Apply suggestions
* Update setup.py
* Style
* Make the option available only if nncf develop is installed
* Update condition
* Fix
* Tweak tests
* Style
* Do not remove nf4_f8 modes
* Remove unnecessary check
* Trigger tests
* Add deprecation warning
1 parent 409c89a commit a26eb13

5 files changed: +120 lines added, -17 lines removed


docs/source/openvino/export.mdx

Lines changed: 6 additions & 5 deletions
@@ -31,8 +31,8 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
-                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}]
+                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -66,9 +66,10 @@ Optional arguments:
   --trust-remote-code Allows to use custom code for the modeling hosted in the model repository. This option should
                       only be set for repositories you trust and in which you have read the code, as it will execute
                       on your local machine arbitrary code present in the model repository.
-  --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
-                      The weight format of the exported model.
-  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}
+  --weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}
+                      The weight format of the exported model. Option 'cb4' represents a codebook with 16
+                      fixed fp8 values in E4M3 format.
+  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}
                       Quantization precision mode. This is used for applying full model quantization including
                       activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
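
For orientation, here is a minimal, hedged sketch of the Python-API counterpart of the new `--weight-format cb4` option (the model id and output directory are placeholders, not part of this commit):

```python
# Sketch only: weight-only compression with the new 'cb4' format, i.e. a codebook
# of 16 fixed fp8 (E4M3) values. Like 'nf4' and 'mxfp4', it requires bits=4.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=4, dtype="cb4", group_size=32)

model = OVModelForCausalLM.from_pretrained(
    "my-org/my-causal-lm",  # placeholder checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("ov_model_cb4")
```

Note that, per the check added in `configuration.py` below, `cb4` requires an NNCF build newer than 2.17 (NNCF develop at the time of this commit).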

optimum/commands/export/openvino.py

Lines changed: 17 additions & 5 deletions
@@ -22,7 +22,11 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from ...exporters import TasksManager
-from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available, is_nncf_available
+from ...intel.utils.import_utils import (
+    DIFFUSERS_IMPORT_ERROR,
+    is_diffusers_available,
+    is_nncf_available,
+)
 from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
 from ...utils.save_utils import maybe_load_preprocessors
 from ..base import BaseOptimumCLICommand, CommandInfo
@@ -72,14 +76,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4"],
+        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4", "cb4"],
         default=None,
-        help="The weight format of the exported model.",
+        help=(
+            "The weight format of the exported model. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format."
+        ),
     )
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"],
+        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "cb4_f8e4m3", "int4_f8e4m3", "int4_f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
@@ -392,7 +398,13 @@ def run(self):
                 raise ValueError(
                     "Dataset is required for full quantization. Please provide it with --dataset argument."
                 )
-            if self.args.quant_mode in ["nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"]:
+            if self.args.quant_mode in [
+                "nf4_f8e4m3",
+                "nf4_f8e5m2",
+                "cb4_f8e4m3",
+                "int4_f8e4m3",
+                "int4_f8e5m2",
+            ]:
                 if library_name == "diffusers":
                     raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")

optimum/intel/openvino/configuration.py

Lines changed: 25 additions & 4 deletions
@@ -660,7 +660,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
             compressed layers. Providing a dataset is required to run scale estimation.
         dtype (`str`, *optional*):
-            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4'].
+            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4'].
+            Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format.
         qptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
@@ -845,11 +846,17 @@ def post_init(self):
 
         if self.dtype is None:
             self.dtype = "int4" if self.bits == 4 else "int8"
-        if self.dtype not in ["int4", "int8", "mxfp4", "nf4"]:
+        if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4"]:
             raise ValueError(
-                f"Weights quantization data type must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.dtype}."
+                "Weights quantization data type must be one of the following: "
+                f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4'], but found: {self.dtype}."
             )
-        if self.dtype in ["mxfp4", "nf4"]:
+        if self.dtype in ["mxfp4", "nf4", "cb4"]:
+            if self.dtype == "cb4" and is_nncf_version("<=", "2.17"):
+                raise ImportError(
+                    "Codebook quantization is currently supported only with NNCF develop. "
+                    "Please run `pip install git+https://github.com/openvinotoolkit/nncf.git`."
+                )
             if self.bits != 4:
                 raise ValueError(
                     f"When applying weight compression with '{self.dtype}' data type, the `bits` parameter must be set to 4, but found {self.bits}"
@@ -877,6 +884,8 @@ def to_nncf_dict(self) -> Dict[str, Any]:
             mode += "_sym" if self.sym else "_asym"
         if mode == "mxfp4":
             mode = "e2m1"
+        if mode == "cb4":
+            mode = "cb4_f8e4m3"
         mode = nncf.CompressWeightsMode(mode)
 
         awq = True if self.quant_method == OVQuantizationMethod.AWQ else None
@@ -1243,6 +1252,18 @@ def __init__(
 
         self.post_init()
 
+    def post_init(self):
+        super().post_init()
+
+        if self.weight_quantization_config.dtype == "nf4" and self.full_quantization_config.dtype in [
+            "f8e4m3",
+            "f8e5m2",
+        ]:
+            logger.warning(
+                "\n`nf4_f8e4m3` and `nf4_f8e5m2` mixed precision quantization modes are deprecated and will be "
+                "removed in optimum-intel v1.26. Please use `cb4_f8e4m3` instead.\n"
+            )
+
     @staticmethod
     def _initialize_quantization_config(
         config: Union[dict, OVWeightQuantizationConfig, OVQuantizationConfig],
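
To make the replacement path concrete, here is a hedged sketch of the `cb4_f8e4m3` mixed mode through the Python API, mirroring the test case added below; the model id is a placeholder, and a calibration dataset is required because the non-compressed part of the model is fully quantized:

```python
# Sketch only: cb4 weight compression combined with f8e4m3 full quantization,
# the replacement for the deprecated nf4_f8e4m3 / nf4_f8e5m2 modes.
from optimum.intel import (
    OVMixedQuantizationConfig,
    OVModelForCausalLM,
    OVQuantizationConfig,
    OVWeightQuantizationConfig,
)

config = OVMixedQuantizationConfig(
    weight_quantization_config=OVWeightQuantizationConfig(bits=4, dtype="cb4", group_size=16),
    full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
    dataset="wikitext2",  # calibration data for the full-quantization part
    num_samples=128,      # placeholder sample count
)

model = OVModelForCausalLM.from_pretrained(
    "my-org/my-causal-lm",  # placeholder checkpoint
    export=True,
    quantization_config=config,
)
```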

tests/openvino/test_exporters_cli.py

Lines changed: 24 additions & 0 deletions
@@ -18,6 +18,7 @@
 from typing import Dict
 from unittest.mock import Mock
 
+import pytest
 from parameterized import parameterized
 from transformers import AutoModelForCausalLM, AutoModelForZeroShotImageClassification, AutoProcessor, AutoTokenizer
 from utils_tests import (
@@ -60,6 +61,7 @@
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS, TemporaryDirectory
 from optimum.intel.utils.import_utils import (
     compare_versions,
+    is_nncf_version,
     is_openvino_tokenizers_available,
     is_openvino_version,
     is_tokenizers_version,
@@ -260,6 +262,18 @@ class OVCLIExportTestCase(unittest.TestCase):
                 "model": {"f8e4m3": 11, "nf4": 5},
             },
         ),
+        (
+            "text-generation",
+            "llama",
+            "cb4_f8e4m3",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --ratio 0.5",
+            {
+                "model": 16,
+            },
+            {
+                "model": {"int8": 5, "int4": 5, "f8e4m3": 16},
+            },
+        ),
         (
             "text-generation",
             "llama",
@@ -467,6 +481,12 @@ class OVCLIExportTestCase(unittest.TestCase):
             "nf4",
             {"model": {"int8": 4, "nf4": 72}},
         ),
+        (
+            "text-generation-with-past",
+            "gpt2",
+            "cb4 --group-size 32",
+            {"model": {"int8": 24, "int4": 20, "f8e4m3": 20}},
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
@@ -977,6 +997,8 @@ def test_exporters_cli_hybrid_quantization(
     def test_exporters_cli_4bit(
         self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]]
     ):
+        if option.startswith("cb4") and is_nncf_version("<=", "2.17"):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -1014,6 +1036,8 @@ def test_exporters_cli_full_quantization(
         expected_fake_nodes_per_model: Dict[str, int],
         expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]],
     ):
+        if quant_mode == "cb4_f8e4m3" and is_nncf_version("<=", "2.17"):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
                 f"optimum-cli export openvino --task {task} --model {MODEL_NAMES[model_type]} "

tests/openvino/test_quantization.py

Lines changed: 48 additions & 3 deletions
@@ -80,7 +80,7 @@
 from copy import deepcopy
 
 from optimum.intel.openvino.quantization import InferRequestWrapper, OVCalibrationDatasetBuilder
-from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version, is_nncf_version
 from utils_tests import (
     MODEL_NAMES,
     get_num_quantized_nodes,
@@ -158,8 +158,8 @@ class OVQuantizerTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama",
-            OVMixedQuantizationConfig(
-                weight_quantization_config=OVWeightQuantizationConfig(
+            dict(
+                weight_quantization_config=dict(
                     bits=4,
                     dtype="nf4",
                     group_size=16,
@@ -180,6 +180,31 @@ class OVQuantizerTest(unittest.TestCase):
                 "model": {"f8e4m3": 8, "nf4": 2},
             },
         ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            dict(
+                weight_quantization_config=dict(
+                    bits=4,
+                    dtype="cb4",
+                    group_size=16,
+                    ratio=0.5,
+                    ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.self_attn"]},
+                ),
+                full_quantization_config=OVQuantizationConfig(
+                    dtype="f8e4m3", ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.mlp"]}
+                ),
+                ignored_scope={"patterns": [f"{pattern_prefix}.layers.1.self_attn"]},
+                dataset="wikitext2",
+                num_samples=1,
+            ),
+            {
+                "model": 8,
+            },
+            {
+                "model": {"int8": 2, "int4": 2, "f8e4m3": 10},
+            },
+        ),
         (
             OVModelForCausalLM,
             "llama",
@@ -597,6 +622,12 @@ def test_ov_model_static_quantization_with_auto_dataset(
         expected_fake_nodes_per_model,
         expected_num_weight_nodes_per_model,
     ):
+        if (
+            isinstance(quantization_config, dict)
+            and quantization_config.get("weight_quantization_config", {}).get("dtype") == "cb4"
+            and is_nncf_version("<=", "2.17")
+        ):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         model_id = MODEL_NAMES[model_name]
 
         with TemporaryDirectory() as tmp_dir:
@@ -689,6 +720,13 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(bits=4, dtype="nf4", group_size=32),
             {"model": {"int8": 4, "nf4": 20}},
         ),
+        (
+            OVModelForCausalLM,
+            "gpt2",
+            False,
+            dict(bits=4, dtype="cb4", group_size=32),
+            {"model": {"int8": 24, "int4": 20, "f8e4m3": 20}},
+        ),
         (
             OVModelForCausalLM,
             "gpt2",
@@ -1345,6 +1383,13 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
     def test_ovmodel_4bit_auto_compression_with_config(
         self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model
     ):
+        if (
+            isinstance(quantization_config, dict)
+            and quantization_config.get("dtype") == "cb4"
+            and is_nncf_version("<=", "2.17")
+        ):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
+
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
