
Commit a26eb13

nikita-savelyevv authored and mvafin committed
Add support for cb4_f8e4m3 quantization mode. (#1378)
* Add support for cb4_f8e4m3 compression format.
* Apply suggestions
* Update setup.py
* Style
* Make the option available only if nncf develop is installed
* Update condition
* Fix
* Tweak tests
* Style
* Do not remove nf4_f8 modes
* Remove unnecessary check
* Trigger tests
* Add deprecation warning
1 parent 409c89a commit a26eb13

5 files changed: +120 lines added, -17 lines removed


docs/source/openvino/export.mdx

Lines changed: 6 additions & 5 deletions
@@ -31,8 +31,8 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
-                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}]
+                                   [--quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
                                    [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -66,9 +66,10 @@ Optional arguments:
   --trust-remote-code Allows to use custom code for the modeling hosted in the model repository. This option should
                       only be set for repositories you trust and in which you have read the code, as it will execute
                       on your local machine arbitrary code present in the model repository.
-  --weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
-                      The weight format of the exported model.
-  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,int4_f8e4m3,int4_f8e5m2}
+  --weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}
+                      The weight format of the exported model. Option 'cb4' represents a codebook with 16
+                      fixed fp8 values in E4M3 format.
+  --quant-mode {int8,f8e4m3,f8e5m2,nf4_f8e4m3,nf4_f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}
                       Quantization precision mode. This is used for applying full model quantization including
                       activations.
   --library {transformers,diffusers,timm,sentence_transformers,open_clip}
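
For orientation, here is a minimal, hedged sketch of the Python-API counterpart of the new `--weight-format cb4` option (the model id and output directory are placeholders, not part of this commit):

```python
# Sketch only: weight-only compression with the new 'cb4' format, i.e. a codebook
# of 16 fixed fp8 (E4M3) values. Like 'nf4' and 'mxfp4', it requires bits=4.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(bits=4, dtype="cb4", group_size=32)

model = OVModelForCausalLM.from_pretrained(
    "my-org/my-causal-lm",  # placeholder checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("ov_model_cb4")
```

Note that, per the check added in `configuration.py` below, `cb4` requires an NNCF build newer than 2.17 (NNCF develop at the time of this commit).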

optimum/commands/export/openvino.py

Lines changed: 17 additions & 5 deletions
@@ -22,7 +22,11 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from ...exporters import TasksManager
-from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available, is_nncf_available
+from ...intel.utils.import_utils import (
+    DIFFUSERS_IMPORT_ERROR,
+    is_diffusers_available,
+    is_nncf_available,
+)
 from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
 from ...utils.save_utils import maybe_load_preprocessors
 from ..base import BaseOptimumCLICommand, CommandInfo
@@ -72,14 +76,16 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4"],
+        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4", "cb4"],
         default=None,
-        help="The weight format of the exported model.",
+        help=(
+            "The weight format of the exported model. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format."
+        ),
     )
     optional_group.add_argument(
         "--quant-mode",
         type=str,
-        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"],
+        choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "cb4_f8e4m3", "int4_f8e4m3", "int4_f8e5m2"],
         default=None,
         help=(
             "Quantization precision mode. This is used for applying full model quantization including activations. "
@@ -392,7 +398,13 @@ def run(self):
                 raise ValueError(
                     "Dataset is required for full quantization. Please provide it with --dataset argument."
                 )
-            if self.args.quant_mode in ["nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"]:
+            if self.args.quant_mode in [
+                "nf4_f8e4m3",
+                "nf4_f8e5m2",
+                "cb4_f8e4m3",
+                "int4_f8e4m3",
+                "int4_f8e5m2",
+            ]:
                 if library_name == "diffusers":
                     raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")

optimum/intel/openvino/configuration.py

Lines changed: 25 additions & 4 deletions
@@ -660,7 +660,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
             compressed layers. Providing a dataset is required to run scale estimation.
         dtype (`str`, *optional*):
-            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4'].
+            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4'].
+            Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format.
         qptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
@@ -845,11 +846,17 @@ def post_init(self):
 
         if self.dtype is None:
             self.dtype = "int4" if self.bits == 4 else "int8"
-        if self.dtype not in ["int4", "int8", "mxfp4", "nf4"]:
+        if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4"]:
             raise ValueError(
-                f"Weights quantization data type must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.dtype}."
+                "Weights quantization data type must be one of the following: "
+                f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4'], but found: {self.dtype}."
             )
-        if self.dtype in ["mxfp4", "nf4"]:
+        if self.dtype in ["mxfp4", "nf4", "cb4"]:
+            if self.dtype == "cb4" and is_nncf_version("<=", "2.17"):
+                raise ImportError(
+                    "Codebook quantization is currently supported only with NNCF develop. "
+                    "Please run `pip install git+https://github.com/openvinotoolkit/nncf.git`."
+                )
             if self.bits != 4:
                 raise ValueError(
                     f"When applying weight compression with '{self.dtype}' data type, the `bits` parameter must be set to 4, but found {self.bits}"
@@ -877,6 +884,8 @@ def to_nncf_dict(self) -> Dict[str, Any]:
             mode += "_sym" if self.sym else "_asym"
         if mode == "mxfp4":
             mode = "e2m1"
+        if mode == "cb4":
+            mode = "cb4_f8e4m3"
         mode = nncf.CompressWeightsMode(mode)
 
         awq = True if self.quant_method == OVQuantizationMethod.AWQ else None
@@ -1243,6 +1252,18 @@ def __init__(
 
         self.post_init()
 
+    def post_init(self):
+        super().post_init()
+
+        if self.weight_quantization_config.dtype == "nf4" and self.full_quantization_config.dtype in [
+            "f8e4m3",
+            "f8e5m2",
+        ]:
+            logger.warning(
+                "\n`nf4_f8e4m3` and `nf4_f8e5m2` mixed precision quantization modes are deprecated and will be "
+                "removed in optimum-intel v1.26. Please use `cb4_f8e4m3` instead.\n"
+            )
+
     @staticmethod
     def _initialize_quantization_config(
         config: Union[dict, OVWeightQuantizationConfig, OVQuantizationConfig],
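
To make the replacement path concrete, here is a hedged sketch of the `cb4_f8e4m3` mixed mode through the Python API, mirroring the test case added below; the model id is a placeholder, and a calibration dataset is required because the non-compressed part of the model is fully quantized:

```python
# Sketch only: cb4 weight compression combined with f8e4m3 full quantization,
# the replacement for the deprecated nf4_f8e4m3 / nf4_f8e5m2 modes.
from optimum.intel import (
    OVMixedQuantizationConfig,
    OVModelForCausalLM,
    OVQuantizationConfig,
    OVWeightQuantizationConfig,
)

config = OVMixedQuantizationConfig(
    weight_quantization_config=OVWeightQuantizationConfig(bits=4, dtype="cb4", group_size=16),
    full_quantization_config=OVQuantizationConfig(dtype="f8e4m3"),
    dataset="wikitext2",  # calibration data for the full-quantization part
    num_samples=128,      # placeholder sample count
)

model = OVModelForCausalLM.from_pretrained(
    "my-org/my-causal-lm",  # placeholder checkpoint
    export=True,
    quantization_config=config,
)
```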

tests/openvino/test_exporters_cli.py

Lines changed: 24 additions & 0 deletions
@@ -18,6 +18,7 @@
 from typing import Dict
 from unittest.mock import Mock
 
+import pytest
 from parameterized import parameterized
 from transformers import AutoModelForCausalLM, AutoModelForZeroShotImageClassification, AutoProcessor, AutoTokenizer
 from utils_tests import (
@@ -60,6 +61,7 @@
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS, TemporaryDirectory
 from optimum.intel.utils.import_utils import (
     compare_versions,
+    is_nncf_version,
     is_openvino_tokenizers_available,
     is_openvino_version,
     is_tokenizers_version,
@@ -260,6 +262,18 @@ class OVCLIExportTestCase(unittest.TestCase):
                 "model": {"f8e4m3": 11, "nf4": 5},
             },
         ),
+        (
+            "text-generation",
+            "llama",
+            "cb4_f8e4m3",
+            "--dataset wikitext2 --num-samples 1 --group-size 16 --trust-remote-code --ratio 0.5",
+            {
+                "model": 16,
+            },
+            {
+                "model": {"int8": 5, "int4": 5, "f8e4m3": 16},
+            },
+        ),
         (
             "text-generation",
             "llama",
@@ -467,6 +481,12 @@ class OVCLIExportTestCase(unittest.TestCase):
             "nf4",
             {"model": {"int8": 4, "nf4": 72}},
         ),
+        (
+            "text-generation-with-past",
+            "gpt2",
+            "cb4 --group-size 32",
+            {"model": {"int8": 24, "int4": 20, "f8e4m3": 20}},
+        ),
         (
             "text-generation-with-past",
             "llama_awq",
@@ -977,6 +997,8 @@ def test_exporters_cli_hybrid_quantization(
     def test_exporters_cli_4bit(
         self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]]
     ):
+        if option.startswith("cb4") and is_nncf_version("<=", "2.17"):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -1014,6 +1036,8 @@ def test_exporters_cli_full_quantization(
         expected_fake_nodes_per_model: Dict[str, int],
         expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]],
     ):
+        if quant_mode == "cb4_f8e4m3" and is_nncf_version("<=", "2.17"):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         with TemporaryDirectory() as tmpdir:
             subprocess.run(
                 f"optimum-cli export openvino --task {task} --model {MODEL_NAMES[model_type]} "

tests/openvino/test_quantization.py

Lines changed: 48 additions & 3 deletions
@@ -80,7 +80,7 @@
 from copy import deepcopy
 
 from optimum.intel.openvino.quantization import InferRequestWrapper, OVCalibrationDatasetBuilder
-from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version, is_nncf_version
 from utils_tests import (
     MODEL_NAMES,
     get_num_quantized_nodes,
@@ -158,8 +158,8 @@ class OVQuantizerTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama",
-            OVMixedQuantizationConfig(
-                weight_quantization_config=OVWeightQuantizationConfig(
+            dict(
+                weight_quantization_config=dict(
                     bits=4,
                     dtype="nf4",
                     group_size=16,
@@ -180,6 +180,31 @@ class OVQuantizerTest(unittest.TestCase):
                 "model": {"f8e4m3": 8, "nf4": 2},
             },
         ),
+        (
+            OVModelForCausalLM,
+            "llama",
+            dict(
+                weight_quantization_config=dict(
+                    bits=4,
+                    dtype="cb4",
+                    group_size=16,
+                    ratio=0.5,
+                    ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.self_attn"]},
+                ),
+                full_quantization_config=OVQuantizationConfig(
+                    dtype="f8e4m3", ignored_scope={"patterns": [f"{pattern_prefix}.layers.0.mlp"]}
+                ),
+                ignored_scope={"patterns": [f"{pattern_prefix}.layers.1.self_attn"]},
+                dataset="wikitext2",
+                num_samples=1,
+            ),
+            {
+                "model": 8,
+            },
+            {
+                "model": {"int8": 2, "int4": 2, "f8e4m3": 10},
+            },
+        ),
         (
             OVModelForCausalLM,
             "llama",
@@ -597,6 +622,12 @@ def test_ov_model_static_quantization_with_auto_dataset(
         expected_fake_nodes_per_model,
         expected_num_weight_nodes_per_model,
     ):
+        if (
+            isinstance(quantization_config, dict)
+            and quantization_config.get("weight_quantization_config", {}).get("dtype") == "cb4"
+            and is_nncf_version("<=", "2.17")
+        ):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
         model_id = MODEL_NAMES[model_name]
 
         with TemporaryDirectory() as tmp_dir:
@@ -689,6 +720,13 @@ class OVWeightCompressionTest(unittest.TestCase):
             dict(bits=4, dtype="nf4", group_size=32),
             {"model": {"int8": 4, "nf4": 20}},
         ),
+        (
+            OVModelForCausalLM,
+            "gpt2",
+            False,
+            dict(bits=4, dtype="cb4", group_size=32),
+            {"model": {"int8": 24, "int4": 20, "f8e4m3": 20}},
+        ),
         (
             OVModelForCausalLM,
             "gpt2",
@@ -1345,6 +1383,13 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
     def test_ovmodel_4bit_auto_compression_with_config(
         self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes_per_model
     ):
+        if (
+            isinstance(quantization_config, dict)
+            and quantization_config.get("dtype") == "cb4"
+            and is_nncf_version("<=", "2.17")
+        ):
+            pytest.skip("Codebook quantization is supported starting from NNCF 2.18")
+
         model_id = MODEL_NAMES[model_name]
         with TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
