Commit f6a7b83

Add new features from NNCF 2.14 (#1021)
* Add LoRA
* Add backup precision
* Fixes
* Add 'auto' dataset option
* Update docs
* Update help code block
* Update help code block
* Rename lora to lora_correction
* Tweak description
* Address comments
* Update minimal NNCF version in requirements
1 parent 16c27ca commit f6a7b83

8 files changed: +163 −49 lines changed

docs/source/openvino/export.mdx

Lines changed: 29 additions & 16 deletions
@@ -29,13 +29,14 @@ optimum-cli export openvino --model local_llama --task text-generation-with-past
 
 Check out the help for more options:
 
-```bash
+```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
                                    [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
-                                   [--group-size GROUP_SIZE] [--dataset DATASET] [--all-layers] [--awq]
-                                   [--scale-estimation] [--gptq] [--sensitivity-metric SENSITIVITY_METRIC]
+                                   [--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
+                                   [--dataset DATASET] [--all-layers] [--awq] [--scale-estimation] [--gptq]
+                                   [--lora-correction] [--sensitivity-metric SENSITIVITY_METRIC]
                                    [--num-samples NUM_SAMPLES] [--disable-stateful] [--disable-convert-tokenizer]
                                    output
 
@@ -49,15 +50,15 @@ Required arguments:
 
 Optional arguments:
   --task TASK           The task to export the model for. If not specified, the task will be auto-inferred based on
-                        the model. Available tasks depend on the model, but are among: ['fill-mask', 'masked-im',
-                        'audio-classification', 'automatic-speech-recognition', 'text-to-audio', 'image-text-to-text',
-                        'depth-estimation', 'image-to-image', 'text-generation', 'text-to-image', 'mask-generation',
-                        'audio-frame-classification', 'sentence-similarity', 'image-classification', 'multiple-
-                        choice', 'text-classification', 'text2text-generation', 'token-classification', 'feature-
-                        extraction', 'zero-shot-image-classification', 'zero-shot-object-detection', 'object-
-                        detection', 'inpainting', 'question-answering', 'semantic-segmentation', 'image-segmentation',
-                        'audio-xvector', 'image-to-text']. For decoder models, use `xxx-with-past` to export the model
-                        using past key values in the decoder.
+                        the model. Available tasks depend on the model, but are among: ['image-to-image',
+                        'image-segmentation', 'inpainting', 'sentence-similarity', 'text-to-audio', 'image-to-text',
+                        'automatic-speech-recognition', 'token-classification', 'text-to-image', 'audio-classification',
+                        'feature-extraction', 'semantic-segmentation', 'masked-im', 'audio-xvector',
+                        'audio-frame-classification', 'text2text-generation', 'multiple-choice', 'depth-estimation',
+                        'image-classification', 'fill-mask', 'zero-shot-object-detection', 'object-detection',
+                        'question-answering', 'zero-shot-image-classification', 'mask-generation', 'text-generation',
+                        'text-classification']. For decoder models, use 'xxx-with-past' to export the model using past
+                        key values in the decoder.
   --framework {pt,tf}   The framework to use for the export. If not provided, will attempt to use the local
                        checkpoint's original framework or what is available in the environment.
   --trust-remote-code   Allows to use custom code for the modeling hosted in the model repository. This option should
@@ -82,10 +83,18 @@ Optional arguments:
   --group-size GROUP_SIZE
                        The group size to use for quantization. Recommended value is 128 and -1 uses per-column
                        quantization.
-  --dataset DATASET     The dataset used for data-aware compression or quantization with NNCF. You can use the one
-                        from the list ['wikitext2','c4','c4-new'] for language models or
-                        ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for
-                        diffusion models.
+  --backup-precision {none,int8_sym,int8_asym}
+                        Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight
+                        format. If not provided, backup precision is int8_asym. 'none' stands for original floating-
+                        point precision of the model weights, in this case weights are retained in their original
+                        precision without any quantization. 'int8_sym' stands for 8-bit integer symmetric quantization
+                        without zero point. 'int8_asym' stands for 8-bit integer asymmetric quantization with zero
+                        points per each quantization group.
+  --dataset DATASET     The dataset used for data-aware compression or quantization with NNCF. For language models you
+                        can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
+                        be collected from model's generations. For diffusion models it should be one of
+                        ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
+                        visual language models the dataset must be set to 'contextual'.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided and
                        weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
@@ -98,6 +107,10 @@ Optional arguments:
   --gptq                Indicates whether to apply GPTQ algorithm that optimizes compressed weights in a layer-wise
                        fashion to minimize the difference between activations of a compressed and original layer.
                        Please note that applying GPTQ takes additional memory and time.
+  --lora-correction     Indicates whether to apply LoRA Correction algorithm. When enabled, this algorithm introduces
+                        low-rank adaptation layers in the model that can recover accuracy after weight compression at
+                        some cost of inference latency. Please note that applying LoRA Correction algorithm takes
+                        additional memory and time.
   --sensitivity-metric SENSITIVITY_METRIC
                        The sensitivity metric for assigning quantization precision to layers. It can be one of the
                        following: ['weight_quantization_error', 'hessian_input_activation',
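
Taken together, the new options extend the export command's 4-bit compression path. A hypothetical invocation combining them (the output directory name is a placeholder; the model and task reuse the example at the top of this file):

```bash
optimum-cli export openvino \
  --model local_llama \
  --task text-generation-with-past \
  --weight-format int4 \
  --backup-precision int8_sym \
  --dataset auto \
  --lora-correction \
  llama_int4_ov
```

Here 'auto' satisfies the dataset requirement of the data-aware LoRA Correction algorithm by collecting calibration samples from the model's own generations.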

docs/source/openvino/optimization.mdx

Lines changed: 3 additions & 0 deletions
@@ -101,6 +101,7 @@ Quality of 4-bit weight compressed model can further be improved by employing on
 * **AWQ** which stands for Activation Aware Quantization is an algorithm that tunes model weights for more accurate 4-bit compression. It slightly improves generation quality of compressed LLMs, but requires significant additional time and memory for tuning weights on a calibration dataset. Please note that it is possible that there will be no matching patterns in the model to apply AWQ, in such case it will be skipped.
 * **Scale Estimation** is a method that tunes quantization scales to minimize the `L2` error between the original and compressed layers. Providing a dataset is required to run scale estimation. Using this method also incurs additional time and memory overhead.
 * **GPTQ** optimizes compressed weights in a layer-wise fashion to minimize the difference between activations of a compressed and original layer.
+* **LoRA Correction** mitigates quantization noise introduced during weight compression by leveraging low-rank adaptation.
 
 Data-aware algorithms can be applied together or separately. For that, provide corresponding arguments to the 4-bit `OVWeightQuantizationConfig` together with a dataset. For example:
 ```python
@@ -115,6 +116,8 @@ quantization_config = OVWeightQuantizationConfig(
 )
 ```
 
+Note: GPTQ and LoRA Correction algorithms can't be applied simultaneously.
+
 ### Static quantization
 
 When applying post-training static quantization, both the weights and the activations are quantized.
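
For context, a minimal sketch of the documented Python path, assuming `OVWeightQuantizationConfig` accepts the new `lora_correction` argument the same way the CLI wires it (the model ID and dataset choice below are illustrative):

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# 4-bit weight compression with LoRA Correction. A dataset is required
# because LoRA Correction is a data-aware algorithm; per the note above,
# don't combine it with gptq=True.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    dataset="wikitext2",
    lora_correction=True,
)

# "local_llama" is a placeholder model ID.
model = OVModelForCausalLM.from_pretrained(
    "local_llama",
    export=True,
    quantization_config=quantization_config,
)
```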

optimum/commands/export/openvino.py

Lines changed: 35 additions & 5 deletions
@@ -117,14 +117,30 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."),
     )
+    optional_group.add_argument(
+        "--backup-precision",
+        type=str,
+        choices=["none", "int8_sym", "int8_asym"],
+        default=None,
+        help=(
+            "Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight format. "
+            "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of "
+            "the model weights, in this case weights are retained in their original precision without any "
+            "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' "
+            "stands for 8-bit integer asymmetric quantization with zero points per each quantization group."
+        ),
+    )
     optional_group.add_argument(
         "--dataset",
         type=str,
         default=None,
         help=(
             "The dataset used for data-aware compression or quantization with NNCF. "
-            "You can use the one from the list ['wikitext2','c4','c4-new'] for language models "
-            "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models."
+            "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the "
+            "dataset will be collected from model's generations. "
+            "For diffusion models it should be one of ['conceptual_captions',"
+            "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
+            "For visual language models the dataset must be set to 'contextual'."
         ),
     )
     optional_group.add_argument(
@@ -143,7 +159,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
         help=(
             "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
             "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
-            "argument. Note: it's possible that there will be no matching patterns in the model to apply AWQ, in such "
+            "argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such "
             "case it will be skipped."
         ),
     )
@@ -167,12 +183,22 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "applying GPTQ takes additional memory and time."
         ),
     )
+    optional_group.add_argument(
+        "--lora-correction",
+        action="store_true",
+        default=None,
+        help=(
+            "Indicates whether to apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank "
+            "adaptation layers in the model that can recover accuracy after weight compression at some cost of "
+            "inference latency. Please note that applying LoRA Correction algorithm takes additional memory and time."
+        ),
+    )
     optional_group.add_argument(
         "--sensitivity-metric",
         type=str,
         default=None,
         help=(
-            "The sensitivity metric for assigning quantization precision to layers. Can be one of the following: "
+            "The sensitivity metric for assigning quantization precision to layers. It can be one of the following: "
             "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
             "'max_activation_variance', 'mean_activation_magnitude']."
         ),
@@ -191,7 +217,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. "
             "If --disable-stateful option is used, it may result in sub-optimal inference performance. "
             "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing "
-            "OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
+            "OpenVINO native inference code that expects KV-cache inputs and outputs in the model."
         ),
     )
     optional_group.add_argument(
@@ -215,7 +241,9 @@ def no_compression_parameter_provided(args):
                 args.awq,
                 args.scale_estimation,
                 args.gptq,
+                args.lora_correction,
                 args.sensitivity_metric,
+                args.backup_precision,
             )
         )
     )
@@ -287,7 +315,9 @@ def run(self):
                 "sensitivity_metric": self.args.sensitivity_metric,
                 "scale_estimation": self.args.scale_estimation,
                 "gptq": self.args.gptq,
+                "lora_correction": self.args.lora_correction,
                 "weight_format": self.args.weight_format,
+                "backup_precision": self.args.backup_precision,
             }
 
             if quantization_config.get("dataset", None) is not None:
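
One detail worth calling out: `--lora-correction` uses `action="store_true"` with `default=None` rather than the usual `False`, so `no_compression_parameter_provided` can tell an omitted flag apart from one the user actually passed. A standalone sketch of that pattern (the names mirror this file, but the snippet is illustrative, not the actual implementation):

```python
import argparse

parser = argparse.ArgumentParser()
# default=None (not False) keeps an omitted flag distinguishable
# from one the user explicitly passed on the command line.
parser.add_argument("--lora-correction", action="store_true", default=None)
parser.add_argument("--backup-precision", type=str, default=None)

def no_compression_parameter_provided(args: argparse.Namespace) -> bool:
    # True only when every compression-related value is still None.
    return all(v is None for v in (args.lora_correction, args.backup_precision))

args = parser.parse_args([])                     # nothing passed
assert no_compression_parameter_provided(args)   # -> True

args = parser.parse_args(["--lora-correction"])  # flag passed
assert args.lora_correction is True
assert not no_compression_parameter_provided(args)
```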
