
Commit 02835ce

[OV] Quantization of Whisper pipeline (#1040)
* Add whisper quantization
* Extend quantization to decoder model too
* Add documentation
* Fix tests
* Add quantization on from_pretrained; change test whisper model
* Update docs
* Update test reference for older transformers version
* Address comments
* Tweak reference
* Tweak test
* Tweak reference
* Style
* Change references
* Trigger Tests
* Create 'tests-openvino' extra dependency
* Style
1 parent 6ea6b5d commit 02835ce

File tree

11 files changed (+378, -88 lines)


.github/workflows/test_openvino.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
+          pip install .[openvino,openvino-tokenizers,diffusers,tests,tests-openvino] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
         name: Downgrade Transformers and Accelerate
```

.github/workflows/test_openvino_full.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -56,7 +56,7 @@ jobs:
           python -m pip install --upgrade pip
           # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests]
+          pip install .[tests,tests-openvino]
 
       - name: Install openvino-nightly
         if: ${{ matrix.openvino == 'ov-nightly' }}
```

.github/workflows/test_openvino_slow.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -42,7 +42,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,tests] transformers[testing]
+          pip install .[openvino,tests,tests-openvino] transformers[testing]
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
```
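All three CI workflows above reference a new `tests-openvino` extra, which the commit message says this PR creates; its actual definition in `setup.py` is not among the lines shown here. As a rough, hypothetical sketch of declaring such an extra (the package list is an illustrative assumption, not the commit's contents):

```python
# setup.py -- hypothetical sketch; the real "tests-openvino" extra and its exact
# dependency list live in the repository's setup.py, which this diff does not show.
from setuptools import setup

setup(
    name="optimum-intel",
    extras_require={
        # Test-only dependencies specific to the OpenVINO suite, e.g. audio
        # decoding support needed to load "librispeech" calibration samples.
        "tests-openvino": ["soundfile", "librosa"],
    },
)
```

With an extra like this in place, `pip install .[tests,tests-openvino]` pulls in the audio-specific test dependencies only for the jobs that need them.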

docs/source/openvino/optimization.mdx

Lines changed: 19 additions & 0 deletions
````diff
@@ -166,6 +166,25 @@ calibration_dataset = quantizer.get_calibration_dataset(
 The `quantize()` method applies post-training static quantization and exports the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device.
 
 
+#### Speech-to-text Models Quantization
+
+The speech-to-text Whisper model can be quantized without the need for preparing a custom calibration dataset. Please see the example below.
+
+```python
+model_id = "openai/whisper-tiny"
+ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
+    model_id,
+    quantization_config=OVQuantizationConfig(
+        num_samples=10,
+        dataset="librispeech",
+        processor=model_id,
+        smooth_quant_alpha=0.95,
+    )
+)
+```
+
+With this, the encoder, decoder and decoder-with-past models of the Whisper pipeline will be fully quantized, including activations.
+
 ### Hybrid quantization
 
 Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion (SD) models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights.
````
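As a usage note on the documentation snippet above: the quantized pipeline behaves like any other OpenVINO model, so it can be saved and reloaded without re-running calibration. A minimal sketch using the standard `optimum.intel` entry points (the output directory name is illustrative):

```python
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

model_id = "openai/whisper-tiny"
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    quantization_config=OVQuantizationConfig(
        num_samples=10,
        dataset="librispeech",
        processor=model_id,
        smooth_quant_alpha=0.95,
    ),
)

# Persist the quantized encoder/decoder/decoder-with-past IR files (an XML plus
# a BIN file each), then reload them later without repeating calibration.
ov_model.save_pretrained("whisper-tiny-int8")  # illustrative path
ov_model = OVModelForSpeechSeq2Seq.from_pretrained("whisper-tiny-int8")
```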

optimum/intel/openvino/configuration.py

Lines changed: 72 additions & 7 deletions
```diff
@@ -26,7 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -255,6 +255,10 @@ def __init__(
         sym: bool = False,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
+        dataset: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional[str] = None,
+        processor: Optional[str] = None,
+        trust_remote_code: bool = False,
         **kwargs,
     ):
         """
@@ -272,6 +276,10 @@ def __init__(
         self.bits = bits
         self.sym = sym
         self.num_samples = num_samples
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.trust_remote_code = trust_remote_code
 
         if isinstance(ignored_scope, nncf.IgnoredScope):
             ignored_scope = ignored_scope.__dict__
@@ -313,6 +321,10 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
                 user or organization name, like `dbmdz/bert-base-german-cased`.
             - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                 using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+        trust_remote_code (`bool`, defaults to `False`):
+            Allows using custom code for the modeling hosted in the model repository. This option should only be set
+            for repositories you trust and in which you have read the code, as it will execute arbitrary code
+            present in the model repository on your local machine.
         dataset (`str or List[str]`, *optional*):
             The dataset used for data-aware compression with NNCF.
             - For language models you can provide your own dataset in a list of strings or just use one from the list
@@ -395,10 +407,16 @@ def __init__(
         backup_precision: Optional[str] = None,
         **kwargs,
     ):
-        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
-        self.tokenizer = tokenizer
-        self.trust_remote_code = trust_remote_code
-        self.dataset = dataset
+        super().__init__(
+            bits=bits,
+            sym=sym,
+            ignored_scope=ignored_scope,
+            num_samples=num_samples,
+            dataset=dataset,
+            tokenizer=tokenizer,
+            processor=processor,
+            trust_remote_code=trust_remote_code,
+        )
         self.group_size = group_size or (-1 if bits == 8 else 128)
         self.ratio = ratio
         self.all_layers = all_layers
@@ -407,7 +425,6 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
-        self.processor = processor
         self.lora_correction = lora_correction
         self.backup_precision = backup_precision
         self.post_init()
@@ -535,6 +552,11 @@ def __init__(
         model_type: str = "transformer",
         fast_bias_correction: bool = True,
         overflow_fix: str = "disable",
+        dataset: Optional[str] = None,
+        tokenizer: Optional[str] = None,
+        processor: Optional[str] = None,
+        trust_remote_code: bool = False,
+        smooth_quant_alpha: Optional[float] = None,
         **kwargs,
     ):
         """
@@ -557,11 +579,42 @@ def __init__(
                 Whether to apply fast or full bias correction algorithm.
             overflow_fix (`str`, defaults to "disable"):
                 Parameter for controlling overflow fix setting.
+            dataset (`str`, *optional*):
+                The dataset used for quantization. For speech-to-text model quantization the allowed value is 'librispeech'.
+            tokenizer (`str`, *optional*):
+                The tokenizer used to process the dataset. You can pass either:
+                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                    Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                    user or organization name, like `dbmdz/bert-base-german-cased`.
+                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                    using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+            processor (`str`, *optional*):
+                A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
+            trust_remote_code (`bool`, defaults to `False`):
+                Allows using custom code for the modeling hosted in the model repository. This option should only be set
+                for repositories you trust and in which you have read the code, as it will execute arbitrary code
+                present in the model repository on your local machine.
+            smooth_quant_alpha (`float`, *optional*):
+                SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
+                reduces quantization error.
         """
-        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
+        super().__init__(
+            bits=bits,
+            sym=sym,
+            ignored_scope=ignored_scope,
+            num_samples=num_samples,
+            dataset=dataset,
+            tokenizer=tokenizer,
+            processor=processor,
+            trust_remote_code=trust_remote_code,
+        )
         self.model_type = model_type
         self.fast_bias_correction = fast_bias_correction
         self.overflow_fix = overflow_fix
+        self.smooth_quant_alpha = smooth_quant_alpha
         self.post_init()
 
     def post_init(self):
@@ -573,6 +626,18 @@ def post_init(self):
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
 
+        if self.dataset is not None:
+            if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
+                raise ValueError(
+                    f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
+                    f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
+                )
+
+        if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
+            raise ValueError(
+                f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
+            )
+
 
 class OVConfig(BaseConfig):
     CONFIG_NAME = "openvino_config.json"
```
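To make the new `post_init()` checks concrete, here is a short sketch of how the extended `OVQuantizationConfig` validates its inputs; it assumes `optimum.intel` exports the class and that NNCF is installed:

```python
from optimum.intel import OVQuantizationConfig

# Valid: "librispeech" is a predefined speech-to-text dataset and the
# SmoothQuant alpha lies inside the accepted [0, 1] range.
config = OVQuantizationConfig(
    num_samples=10,
    dataset="librispeech",
    processor="openai/whisper-tiny",
    smooth_quant_alpha=0.95,
)

# An unknown dataset name raises a ValueError listing the supported choices.
try:
    OVQuantizationConfig(dataset="common_voice")
except ValueError as err:
    print(err)

# An alpha outside [0, 1] raises from the smooth_quant_alpha range check.
try:
    OVQuantizationConfig(dataset="librispeech", smooth_quant_alpha=1.5)
except ValueError as err:
    print(err)
```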

optimum/intel/openvino/modeling_seq2seq.py

Lines changed: 20 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import copy
 import logging
 import os
 from pathlib import Path
@@ -35,7 +35,9 @@
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 
+from .. import OVConfig, OVQuantizer
 from ..utils import is_transformers_version
+from .configuration import OVQuantizationConfig, OVQuantizationConfigBase
 from .modeling_base_seq2seq import OVBaseModelForSeq2SeqLM
 from .utils import OV_TO_PT_TYPE, _print_compiled_model_properties
 
@@ -973,9 +975,25 @@ def _from_pretrained(
         cls,
         model_id: Union[str, Path],
         config: "PretrainedConfig",
+        load_in_8bit: bool = False,
+        quantization_config: Union[dict, OVQuantizationConfigBase] = None,
         **kwargs,
     ):
-        return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)
+        compile_only = kwargs.get("compile_only", False)
+
+        if not compile_only and isinstance(quantization_config, OVQuantizationConfig):
+            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
+                model_id, config, load_in_8bit=False, **kwargs
+            )
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.processor = quantization_config.processor or model_id
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+        else:
+            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
+                model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
+            )
+
+        return model
 
     class DummyWhisperModel:
         def __init__(self):
```
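For context, the branch added above means a full `OVQuantizationConfig` (as opposed to weight-only compression via `load_in_8bit` or `OVWeightQuantizationConfig`) makes `from_pretrained` load the model unquantized first, deep-copy the config so the caller's object is not mutated, default `processor` to the model id, and then run `OVQuantizer` with the calibration dataset. A minimal sketch of triggering this path; `export=True` is an assumption here for converting the PyTorch checkpoint to OpenVINO IR before quantization:

```python
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

# Passing an OVQuantizationConfig (not a weight-only config) routes through the
# quantize-on-load branch: load unquantized, then calibrate and quantize in place.
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-tiny",
    export=True,  # convert the PyTorch checkpoint to OpenVINO IR first
    quantization_config=OVQuantizationConfig(num_samples=10, dataset="librispeech"),
)
```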
