Skip to content

Commit c9ff040

Browse files
Propagate OV*QuantizationConfig kwargs to nncf calls (#1179)
* Initial commit
* Add docs
* Remove explicit init_kwargs
* Update docs/source/openvino/optimization.mdx
  Co-authored-by: Alexander Kozlov <[email protected]>
* init_kwargs -> kwargs
* Fix test after merging
---------
Co-authored-by: Alexander Kozlov <[email protected]>
1 parent 727b6ce commit c9ff040

File tree

4 files changed

+171
-31
lines changed

4 files changed

+171
-31
lines changed

docs/source/openvino/optimization.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ quantization_config = OVWeightQuantizationConfig(
8787
)
8888
```
8989

90+
Note: `OVWeightQuantizationConfig` also accepts keyword arguments that are not listed in its constructor. In this case such arguments will be passed directly to `nncf.compress_weights()` call. This is useful for passing additional parameters to the quantization algorithm.
91+
9092
By default the quantization scheme will be [asymmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization), to make it [symmetric](https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) you can add `sym=True`.
9193

9294
For 4-bit quantization you can also specify the following arguments in the quantization configuration :

optimum/intel/openvino/configuration.py

Lines changed: 35 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def __init__(
294294
dataset: Optional[Union[str, List[str]]] = None,
295295
tokenizer: Optional[str] = None,
296296
processor: Optional[str] = None,
297-
trust_remote_code: bool = False,
297+
trust_remote_code: Optional[bool] = False,
298298
**kwargs,
299299
):
300300
"""
@@ -323,6 +323,7 @@ def __init__(
323323
if isinstance(ignored_scope, nncf.IgnoredScope):
324324
ignored_scope = ignored_scope.__dict__
325325
self.ignored_scope = ignored_scope
326+
self.kwargs = kwargs
326327

327328
def post_init(self):
328329
try:
@@ -342,6 +343,12 @@ def get_ignored_scope_instance(self) -> "nncf.IgnoredScope":
342343
def clone(self):
343344
return copy.deepcopy(self)
344345

346+
def to_dict(self) -> Dict[str, Any]:
347+
# Unpack kwargs dict
348+
result = super().to_dict()
349+
result = result | result.pop("kwargs", {})
350+
return result
351+
345352

346353
@dataclass
347354
class OVWeightQuantizationConfig(OVQuantizationConfigBase):
@@ -427,6 +434,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
427434
retained in their original precision without any quantization.
428435
- "int8_sym" stands for 8-bit integer symmetric quantization without zero point.
429436
- "int8_asym" stands for 8-bit integer asymmetric quantization with zero points per each quantization group.
437+
kwargs: Additional parameters for nncf.compress_weights() call.
430438
"""
431439

432440
def __init__(
@@ -451,13 +459,21 @@ def __init__(
451459
backup_precision: Optional[str] = None,
452460
**kwargs,
453461
):
462+
weight_format = kwargs.pop("weight_format", None)
463+
if weight_format is not None:
464+
logger.warning(
465+
"The `weight_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. "
466+
"Please use `dtype` instead."
467+
)
468+
dtype = weight_format
454469
super().__init__(
455470
ignored_scope=ignored_scope,
456471
num_samples=num_samples,
457472
dataset=dataset,
458473
tokenizer=tokenizer,
459474
processor=processor,
460475
trust_remote_code=trust_remote_code,
476+
**kwargs,
461477
)
462478
self.bits = bits
463479
self.sym = sym
@@ -470,12 +486,6 @@ def __init__(
470486
self.gptq = gptq
471487
self.lora_correction = lora_correction
472488
self.backup_precision = backup_precision
473-
if kwargs.get("weight_format") is not None:
474-
logger.warning(
475-
"The `weight_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. "
476-
"Please use `dtype` instead."
477-
)
478-
dtype = kwargs.get("weight_format")
479489
self.dtype = dtype
480490
self.post_init()
481491

@@ -624,6 +634,7 @@ def to_nncf_dict(self) -> Dict[str, Any]:
624634
"gptq": self.gptq,
625635
"lora_correction": self.lora_correction,
626636
"backup_mode": backup_mode,
637+
**self.kwargs,
627638
}
628639
return result
629640

@@ -712,27 +723,30 @@ def __init__(
712723
reduces quantization error.
713724
dtype (`str`, defaults to "int8"):
714725
Data type activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
726+
kwargs: Additional parameters for nncf.quantize() call.
715727
"""
728+
activation_format = kwargs.pop("activation_format", None)
729+
if activation_format is not None:
730+
logger.warning(
731+
"The `activation_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. "
732+
"Please use `dtype` instead."
733+
)
734+
dtype = activation_format
716735
super().__init__(
717736
ignored_scope=ignored_scope,
718737
num_samples=num_samples,
719738
dataset=dataset,
720739
tokenizer=tokenizer,
721740
processor=processor,
722741
trust_remote_code=trust_remote_code,
742+
**kwargs,
723743
)
724744
self.bits = bits
725745
self.sym = sym
726746
self.model_type = model_type
727747
self.fast_bias_correction = fast_bias_correction
728748
self.overflow_fix = overflow_fix
729749
self.smooth_quant_alpha = smooth_quant_alpha
730-
if kwargs.get("activation_format") is not None:
731-
logger.warning(
732-
"The `activation_format` parameter is deprecated and will be removed in optimum-intel v1.24.0. "
733-
"Please use `dtype` instead."
734-
)
735-
dtype = kwargs.get("activation_format")
736750
self.dtype = dtype
737751

738752
f8_dtypes = ["f8e4m3", "f8e5m2"]
@@ -769,23 +783,19 @@ def to_nncf_dict(self) -> Dict[str, Any]:
769783
Returns a dictionary with the variables that are ready to use for nncf.compress_weights() call.
770784
"""
771785

772-
preset = "performance" if self.sym else "mixed"
773-
advanced_parameters_dict = {"overflow_fix": self.overflow_fix}
786+
# Merge advanced parameters from kwargs if they were provided
787+
kwargs_copy = copy.deepcopy(self.kwargs)
788+
advanced_parameters = kwargs_copy.pop("advanced_parameters", nncf.AdvancedQuantizationParameters())
789+
advanced_parameters.overflow_fix = nncf.OverflowFix(self.overflow_fix)
774790
if self.smooth_quant_alpha:
775-
advanced_parameters_dict["smooth_quant_alphas"] = {"matmul": self.smooth_quant_alpha}
791+
advanced_parameters.smooth_quant_alphas.matmul = self.smooth_quant_alpha
776792

777793
mode_map = {"f8e4m3": "fp8_e4m3", "f8e5m2": "fp8_e5m2"}
778794
mode = mode_map.get(self.dtype)
779795

796+
preset = "performance" if self.sym else "mixed"
780797
preset = nncf.QuantizationPreset(preset)
781798
model_type = nncf.ModelType(self.model_type)
782-
advanced_parameters = nncf.AdvancedQuantizationParameters(
783-
overflow_fix=advanced_parameters_dict["overflow_fix"],
784-
)
785-
if "smooth_quant_alphas" in advanced_parameters_dict:
786-
advanced_parameters.smooth_quant_alphas = nncf.AdvancedSmoothQuantParameters(
787-
**advanced_parameters_dict["smooth_quant_alphas"]
788-
)
789799

790800
return {
791801
"mode": mode,
@@ -795,6 +805,7 @@ def to_nncf_dict(self) -> Dict[str, Any]:
795805
"model_type": model_type,
796806
"ignored_scope": self.get_ignored_scope_instance(),
797807
"advanced_parameters": advanced_parameters,
808+
**kwargs_copy,
798809
}
799810

800811

@@ -965,6 +976,7 @@ def __init__(
965976
tokenizer=tokenizer,
966977
processor=processor,
967978
trust_remote_code=trust_remote_code,
979+
**kwargs,
968980
)
969981

970982
self.post_init()

optimum/intel/openvino/quantization.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,8 +1021,18 @@ def _weight_only_quantization(
10211021
else:
10221022
dataset = nncf.Dataset(calibration_dataset)
10231023

1024-
wc_kwargs = copy.deepcopy(kwargs)
1025-
wc_kwargs.update(config.to_nncf_dict())
1024+
wc_kwargs = config.to_nncf_dict()
1025+
1026+
# Arguments provided in kwargs override the ones from the config
1027+
kwargs_intersection = set(wc_kwargs.keys()) & set(kwargs.keys())
1028+
if kwargs_intersection:
1029+
logger.warning(
1030+
f"The following nncf.compress_weights() arguments from the OVWeightQuantizationConfig will be overridden "
1031+
f"by the ones given in _weight_only_quantization call kwargs: {kwargs_intersection}."
1032+
)
1033+
wc_kwargs.update(kwargs)
1034+
wc_kwargs.pop("weight_only", None)
1035+
10261036
compressed_model = nncf.compress_weights(
10271037
model,
10281038
dataset=dataset,
@@ -1048,8 +1058,19 @@ def _full_quantization(
10481058

10491059
if verify_not_optimized:
10501060
_verify_not_optimized(model)
1051-
q_kwargs = copy.deepcopy(kwargs)
1052-
q_kwargs.update(quantization_config.to_nncf_dict())
1061+
1062+
q_kwargs = quantization_config.to_nncf_dict()
1063+
1064+
# Arguments provided in kwargs override the ones from the config
1065+
kwargs_intersection = set(q_kwargs.keys()) & set(kwargs.keys())
1066+
if kwargs_intersection:
1067+
logger.warning(
1068+
f"The following nncf.quantize() arguments from the OVQuantizationConfig will be overridden "
1069+
f"by the ones given in _full_quantization call kwargs: {kwargs_intersection}."
1070+
)
1071+
q_kwargs.update(kwargs)
1072+
q_kwargs.pop("weight_only", None)
1073+
10531074
quantized_model = nncf.quantize(model, calibration_dataset=calibration_dataset, **q_kwargs)
10541075

10551076
_remove_f16_kv_cache_precision_flag(quantized_model)

tests/openvino/test_quantization.py

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,18 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import dataclasses
1415
import inspect
1516

1617
# ruff: noqa
1718

1819
import itertools
1920
import logging
2021
import unittest
21-
from collections import defaultdict
22+
from collections import defaultdict, Iterable
2223
from enum import Enum
2324
from functools import partial
24-
from typing import Union
25+
from typing import Union, Type
2526

2627
import openvino as ov
2728
import pytest
@@ -77,7 +78,7 @@
7778
from optimum.intel.openvino.utils import TemporaryDirectory
7879
from copy import deepcopy
7980

80-
from optimum.intel.openvino.quantization import InferRequestWrapper
81+
from optimum.intel.openvino.quantization import InferRequestWrapper, _weight_only_quantization, _full_quantization
8182
from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
8283
from utils_tests import (
8384
MODEL_NAMES,
@@ -1211,7 +1212,6 @@ class OVQuantizationConfigTest(unittest.TestCase):
12111212
),
12121213
),
12131214
(OVQuantizationConfig(ignored_scope=nncf.IgnoredScope(names=["op_name"])),),
1214-
(OVDynamicQuantizationConfig(bits=8, sym=True),),
12151215
)
12161216

12171217
QUANTIZATION_CONFIG_DICTS = (
@@ -1276,6 +1276,60 @@ class OVQuantizationConfigTest(unittest.TestCase):
12761276
(dict(bits=8, fast_bias_correction=True, weight_only=False), OVQuantizationConfig, None),
12771277
)
12781278

1279+
QUANTIZATION_CONFIGS_WITH_KWARGS = (
1280+
(
1281+
OVWeightQuantizationConfig,
1282+
{
1283+
"advanced_parameters": nncf.AdvancedCompressionParameters(statistics_path="statistics_path"),
1284+
"some_arg": "some_value",
1285+
},
1286+
{
1287+
"advanced_parameters": nncf.AdvancedCompressionParameters(statistics_path="statistics_path"),
1288+
"some_arg": "some_value",
1289+
},
1290+
),
1291+
(
1292+
OVQuantizationConfig,
1293+
{
1294+
"advanced_parameters": nncf.AdvancedQuantizationParameters(disable_channel_alignment=True),
1295+
"some_arg": "some_value",
1296+
},
1297+
{
1298+
"advanced_parameters": nncf.AdvancedQuantizationParameters(
1299+
overflow_fix=nncf.OverflowFix.DISABLE,
1300+
disable_channel_alignment=True,
1301+
),
1302+
"some_arg": "some_value",
1303+
},
1304+
),
1305+
(
1306+
OVQuantizationConfig,
1307+
{
1308+
"advanced_parameters": nncf.AdvancedQuantizationParameters(overflow_fix=nncf.OverflowFix.ENABLE),
1309+
},
1310+
{
1311+
"advanced_parameters": nncf.AdvancedQuantizationParameters(
1312+
overflow_fix=nncf.OverflowFix.DISABLE,
1313+
),
1314+
},
1315+
),
1316+
(
1317+
OVQuantizationConfig,
1318+
{
1319+
"smooth_quant_alpha": 0.5,
1320+
"advanced_parameters": nncf.AdvancedQuantizationParameters(
1321+
smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=0.7, convolution=0.7),
1322+
),
1323+
},
1324+
{
1325+
"advanced_parameters": nncf.AdvancedQuantizationParameters(
1326+
overflow_fix=nncf.OverflowFix.DISABLE,
1327+
smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=0.5, convolution=0.7),
1328+
),
1329+
},
1330+
),
1331+
)
1332+
12791333
def get_default_configurations() -> dict:
12801334
default_configurations = deepcopy(_DEFAULT_4BIT_CONFIGS)
12811335
default_configurations.update({"default": _DEFAULT_4BIT_CONFIG})
@@ -1327,6 +1381,57 @@ def test_for_no_short_id_duplicates(self):
13271381
assert short_id not in short_ids
13281382
short_ids.add(short_id)
13291383

1384+
@parameterized.expand(QUANTIZATION_CONFIGS_WITH_KWARGS)
1385+
def test_config_init_kwargs(
1386+
self,
1387+
config_type: Type[Union[OVWeightQuantizationConfig, OVQuantizationConfig]],
1388+
config_kwargs: dict,
1389+
ref_nncf_dict: dict,
1390+
):
1391+
nncf_dict = config_type(**config_kwargs).to_nncf_dict()
1392+
ref_nncf_dict = config_type().to_nncf_dict() | ref_nncf_dict
1393+
self.assertTrue(self.compare_objects(nncf_dict, ref_nncf_dict))
1394+
1395+
@parameterized.expand(
1396+
[
1397+
("nncf.compress_weights", "_weight_only_quantization", "dataset", OVWeightQuantizationConfig),
1398+
("nncf.quantize", "_full_quantization", "calibration_dataset", OVQuantizationConfig),
1399+
]
1400+
)
1401+
def test_quantization_kwargs_override(self, mock_method_name, quantization_function, dataset_key, config_type):
1402+
with unittest.mock.patch(mock_method_name) as mock_method:
1403+
mock_model = unittest.mock.Mock([])
1404+
mock_model.get_rt_info = unittest.mock.Mock(return_value={})
1405+
1406+
mock_quantization_config = unittest.mock.Mock(config_type)
1407+
mock_quantization_config.to_nncf_dict.return_value = {"param1": "value1", "param2": "value2"}
1408+
1409+
additional_kwargs = {"param2": "new_value2", "param3": "value3"}
1410+
1411+
quantization_function = globals()[quantization_function]
1412+
quantization_function(mock_model, mock_quantization_config, None, **additional_kwargs)
1413+
1414+
expected_kwargs = {"param1": "value1", "param2": "new_value2", "param3": "value3", dataset_key: None}
1415+
1416+
mock_method.assert_called_once_with(mock_model, **expected_kwargs)
1417+
1418+
@staticmethod
1419+
def compare_objects(o1, o2) -> bool:
1420+
if dataclasses.is_dataclass(o1) and dataclasses.is_dataclass(o2):
1421+
o1 = o1.__dict__
1422+
o2 = o2.__dict__
1423+
if isinstance(o1, dict) and isinstance(o2, dict):
1424+
for k in set(o1.keys()) | set(o2.keys()):
1425+
if not OVQuantizationConfigTest.compare_objects(o1[k], o2[k]):
1426+
return False
1427+
return True
1428+
if isinstance(o1, Iterable) and isinstance(o2, Iterable) and not (isinstance(o1, str) or isinstance(o2, str)):
1429+
for it1, it2 in zip(o1, o2):
1430+
if not OVQuantizationConfigTest.compare_objects(it1, it2):
1431+
return False
1432+
return True
1433+
return o1 == o2
1434+
13301435

13311436
class InferRequestWrapperTest(unittest.TestCase):
13321437
MODEL_NAME = ("whisper",)

0 commit comments

Comments (0)