Skip to content

Commit 1ac67bd

Browse files
nikita-savelyevv authored and mvafin committed
Add quantization statistics path argument (#1392)
* Add quantization statistics path argument * Add note * Handle additional cases
1 parent a26eb13 commit 1ac67bd

File tree

6 files changed

+96
-5
lines changed

6 files changed

+96
-5
lines changed

docs/source/openvino/export.mdx

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,
3838
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
3939
[--dataset DATASET] [--all-layers] [--awq] [--scale-estimation] [--gptq]
4040
[--lora-correction] [--sensitivity-metric SENSITIVITY_METRIC]
41+
[--quantization-statistics-path QUANTIZATION_STATISTICS_PATH]
4142
[--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer]
4243
[--smooth-quant-alpha SMOOTH_QUANT_ALPHA]
4344
output
@@ -136,6 +137,12 @@ Optional arguments:
136137
The sensitivity metric for assigning quantization precision to layers. It can be one of the
137138
following: ['weight_quantization_error', 'hessian_input_activation',
138139
'mean_activation_variance', 'max_activation_variance', 'mean_activation_magnitude'].
140+
--quantization-statistics-path QUANTIZATION_STATISTICS_PATH
141+
Directory path to dump/load data-aware weight-only quantization statistics. This is useful when
142+
running data-aware quantization multiple times on the same model and dataset to avoid
143+
recomputing statistics. This option is applicable exclusively for weight-only quantization.
144+
Please note that the statistics depend on the dataset, so if you change the dataset, you should
145+
also change the statistics path to avoid confusion.
139146
--num-samples NUM_SAMPLES
140147
The maximum number of samples to take from the dataset for quantization.
141148
--disable-stateful Disable stateful converted models, stateless models will be generated instead. Stateful models

optimum/commands/export/openvino.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,17 @@ def parse_args_openvino(parser: "ArgumentParser"):
239239
"'max_activation_variance', 'mean_activation_magnitude']."
240240
),
241241
)
242+
optional_group.add_argument(
243+
"--quantization-statistics-path",
244+
type=str,
245+
default=None,
246+
help=(
247+
"Directory path to dump/load data-aware weight-only quantization statistics. This is useful when running "
248+
"data-aware quantization multiple times on the same model and dataset to avoid recomputing statistics. "
249+
"This option is applicable exclusively for weight-only quantization. Please note that the statistics depend "
250+
"on the dataset, so if you change the dataset, you should also change the statistics path to avoid confusion."
251+
),
252+
)
242253
optional_group.add_argument(
243254
"--num-samples",
244255
type=int,
@@ -278,6 +289,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
278289

279290

280291
def no_compression_parameter_provided(args):
292+
# Except statistics path
281293
return all(
282294
(
283295
it is None
@@ -354,7 +366,7 @@ def run(self):
354366

355367
if self.args.weight_format is None and self.args.quant_mode is None:
356368
ov_config = None
357-
if not no_compression_parameter_provided(self.args):
369+
if not no_compression_parameter_provided(self.args) or self.args.quantization_statistics_path is not None:
358370
raise ValueError(
359371
"Some compression parameters are provided, but the weight format is not specified. "
360372
"Please provide it with --weight-format argument."
@@ -384,6 +396,8 @@ def run(self):
384396
else:
385397
quantization_config = _DEFAULT_4BIT_WQ_CONFIG
386398
log_message = f"Applying a default quantization config: {quantization_config}."
399+
if self.args.quantization_statistics_path is not None:
400+
quantization_config["statistics_path"] = self.args.quantization_statistics_path
387401
logger.info(log_message)
388402
else:
389403
quantization_config = prepare_wc_config(self.args, _DEFAULT_4BIT_WQ_CONFIG)
@@ -422,6 +436,11 @@ def run(self):
422436
"dataset": self.args.dataset,
423437
}
424438
else:
439+
if self.args.quantization_statistics_path is not None:
440+
logger.warning(
441+
"The --quantization-statistics-path argument is only applicable for weight-only "
442+
"quantization. It will be ignored."
443+
)
425444
quantization_config = prepare_q_config(self.args)
426445
quantization_config["trust_remote_code"] = self.args.trust_remote_code
427446
ov_config = OVConfig(quantization_config=quantization_config)
@@ -590,6 +609,7 @@ def prepare_wc_config(args, default_configs):
590609
"lora_correction": args.lora_correction,
591610
"dtype": args.weight_format,
592611
"backup_precision": args.backup_precision,
612+
"statistics_path": args.quantization_statistics_path,
593613
}
594614

595615

optimum/intel/openvino/configuration.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
682682
retained in their original precision without any quantization.
683683
- "int8_sym" stands for 8-bit integer symmetric quantization without zero point.
684684
- "int8_asym" stands for 8-bit integer asymmetric quantization with zero points per each quantization group.
685+
statistics_path (`str`, *optional*):
686+
Directory path to dump/load data-aware statistics. This is useful when running data-aware quantization
687+
multiple times on the same model and dataset to avoid recomputing statistics.
688+
Please note that the statistics depend on the dataset, so if you change the dataset, you should also change
689+
the statistics path to avoid confusion.
685690
kwargs: Additional parameters for nncf.compress_weights() call.
686691
"""
687692

@@ -705,6 +710,7 @@ def __init__(
705710
processor: Optional[str] = None,
706711
lora_correction: bool = None,
707712
backup_precision: Optional[str] = None,
713+
statistics_path: Optional[str] = None,
708714
**kwargs,
709715
):
710716
weight_format = kwargs.pop("weight_format", None)
@@ -735,6 +741,7 @@ def __init__(
735741
self.lora_correction = lora_correction
736742
self.backup_precision = backup_precision
737743
self.dtype = dtype
744+
self.statistics_path = statistics_path
738745
self.post_init()
739746

740747
def post_init(self):
@@ -891,6 +898,11 @@ def to_nncf_dict(self) -> Dict[str, Any]:
891898
awq = True if self.quant_method == OVQuantizationMethod.AWQ else None
892899
sensitivity_metric = nncf.SensitivityMetric(self.sensitivity_metric) if self.sensitivity_metric else None
893900
backup_mode = nncf.BackupMode(self.backup_precision) if self.backup_precision else None
901+
kwargs = self.kwargs.copy()
902+
if self.statistics_path:
903+
advanced_parameters = kwargs.get("advanced_parameters", nncf.AdvancedCompressionParameters())
904+
advanced_parameters = dataclasses.replace(advanced_parameters, statistics_path=self.statistics_path)
905+
kwargs["advanced_parameters"] = advanced_parameters
894906
result = {
895907
"mode": mode,
896908
"ratio": self.ratio,
@@ -904,7 +916,7 @@ def to_nncf_dict(self) -> Dict[str, Any]:
904916
"gptq": self.gptq,
905917
"lora_correction": self.lora_correction,
906918
"backup_mode": backup_mode,
907-
**self.kwargs,
919+
**kwargs,
908920
}
909921
return result
910922

optimum/intel/openvino/quantization.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import copy
16+
import dataclasses
1617
import inspect
1718
import logging
1819
import os
@@ -1674,6 +1675,11 @@ def _weight_only_quantization(
16741675
wc_kwargs.update(kwargs)
16751676
wc_kwargs.pop("weight_only", None)
16761677

1678+
advanced_parameters = wc_kwargs.get("advanced_parameters")
1679+
if advanced_parameters is not None and advanced_parameters.statistics_path is not None and dataset is None:
1680+
# Graceful handling of unnecessary statistics_path
1681+
wc_kwargs["advanced_parameters"] = dataclasses.replace(advanced_parameters, statistics_path=None)
1682+
16771683
compressed_model = nncf.compress_weights(
16781684
model,
16791685
dataset=dataset,

tests/openvino/test_exporters_cli.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,6 +1026,25 @@ def test_exporters_cli_4bit(
10261026
"--lora-correction" not in option or b"with correction of low-rank adapters" in result.stdout
10271027
)
10281028

1029+
def test_exporters_cli_4bit_with_statistics_path(self):
1030+
with TemporaryDirectory() as tmpdir:
1031+
statistics_path = f"{tmpdir}/statistics"
1032+
result = subprocess.run(
1033+
f"optimum-cli export openvino --model {MODEL_NAMES['llama']} --weight-format int4 --awq "
1034+
f"--dataset wikitext2 --group-size 4 --quantization-statistics-path {statistics_path} {tmpdir}",
1035+
shell=True,
1036+
check=True,
1037+
capture_output=True,
1038+
)
1039+
self.assertTrue(
1040+
b"Statistics were successfully saved to a directory " + bytes(statistics_path, "utf-8")
1041+
in result.stdout
1042+
)
1043+
self.assertTrue(
1044+
b"Statistics were successfully loaded from a directory " + bytes(statistics_path, "utf-8")
1045+
in result.stdout
1046+
)
1047+
10291048
@parameterized.expand(SUPPORTED_QUANTIZATION_ARCHITECTURES)
10301049
def test_exporters_cli_full_quantization(
10311050
self,
@@ -1069,7 +1088,7 @@ def test_exporters_cli_full_quantization(
10691088
[
10701089
(
10711090
"falcon-40b",
1072-
"tiiuae/falcon-7b-instruct",
1091+
"bigscience/bloomz-560m",
10731092
AutoModelForCausalLM,
10741093
OVModelForCausalLM,
10751094
"--task text-generation-with-past --weight-format int4",
@@ -1112,16 +1131,20 @@ def test_exporters_cli_with_default_config(
11121131
with open(Path(tmpdir) / "config.json", "w") as wf:
11131132
json.dump(config, wf)
11141133

1134+
is_weight_compression = "--weight-format" in options
1135+
run_command = f"optimum-cli export openvino --model {tmpdir} {options} {tmpdir}"
1136+
if is_weight_compression:
1137+
# Providing quantization statistics path should not interfere with the default configuration matching
1138+
run_command += f" --quantization-statistics-path {tmpdir}/statistics"
11151139
subprocess.run(
1116-
f"optimum-cli export openvino --model {tmpdir} {options} {tmpdir}",
1140+
run_command,
11171141
shell=True,
11181142
check=True,
11191143
)
11201144

11211145
model = ov_model_cls.from_pretrained(tmpdir)
11221146
rt_info = model.model.get_rt_info()
11231147
nncf_info = rt_info["nncf"]
1124-
is_weight_compression = "weight_compression" in nncf_info
11251148
model_quantization_config = nncf_info["weight_compression" if is_weight_compression else "quantization"]
11261149

11271150
default_config = {**default_configs_collection[model_id]}
@@ -1134,6 +1157,10 @@ def test_exporters_cli_with_default_config(
11341157
quant_method = default_config.pop("quant_method", None)
11351158
default_config["awq"] = quant_method == "awq"
11361159
default_config["gptq"] = quant_method == "gptq"
1160+
advanced_parameters = eval(model_quantization_config["advanced_parameters"].value)
1161+
model_quantization_config["statistics_path"] = Mock()
1162+
model_quantization_config["statistics_path"].value = advanced_parameters["statistics_path"]
1163+
default_config["statistics_path"] = f"{tmpdir}/statistics"
11371164
else:
11381165
dtype = default_config.pop("dtype", None)
11391166
self.assertEqual(dtype, "int8")

tests/openvino/test_quantization.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2114,6 +2114,25 @@ class OVQuantizationConfigTest(unittest.TestCase):
21142114
"some_arg": "some_value",
21152115
},
21162116
),
2117+
(
2118+
OVWeightQuantizationConfig,
2119+
{
2120+
"advanced_parameters": nncf.AdvancedCompressionParameters(statistics_path="statistics_path"),
2121+
"statistics_path": "statistics_path2",
2122+
},
2123+
{
2124+
"advanced_parameters": nncf.AdvancedCompressionParameters(statistics_path="statistics_path2"),
2125+
},
2126+
),
2127+
(
2128+
OVWeightQuantizationConfig,
2129+
{
2130+
"statistics_path": "statistics_path",
2131+
},
2132+
{
2133+
"advanced_parameters": nncf.AdvancedCompressionParameters(statistics_path="statistics_path"),
2134+
},
2135+
),
21172136
(
21182137
OVQuantizationConfig,
21192138
{

0 commit comments

Comments (0)