
Commit 02835ce

[OV] Quantization of Whisper pipeline (#1040)
* Add whisper quantization
* Extend quantization to decoder model too
* Add documentation
* Fix tests
* Add quantization on from_pretrained; change test whisper model
* Update docs
* Update test reference for older transformers version
* Address comments
* Tweak reference
* Tweak test
* Tweak reference
* Style
* Change references
* Trigger Tests
* Create 'tests-openvino' extra dependency
* Style
1 parent 6ea6b5d commit 02835ce

File tree

11 files changed (+378, -88 lines)


.github/workflows/test_openvino.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
+          pip install .[openvino,openvino-tokenizers,diffusers,tests,tests-openvino] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
         name: Downgrade Transformers and Accelerate
```

.github/workflows/test_openvino_full.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -56,7 +56,7 @@ jobs:
           python -m pip install --upgrade pip
           # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests]
+          pip install .[tests,tests-openvino]
 
       - name: Install openvino-nightly
         if: ${{ matrix.openvino == 'ov-nightly' }}
```

.github/workflows/test_openvino_slow.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -42,7 +42,7 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,tests] transformers[testing]
+          pip install .[openvino,tests,tests-openvino] transformers[testing]
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
```
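All three CI workflows above reference a new `tests-openvino` extra, which the commit message says this PR creates; its actual definition in `setup.py` is not among the lines shown here. As a rough, hypothetical sketch of declaring such an extra (the package list is an illustrative assumption, not the commit's contents):

```python
# setup.py -- hypothetical sketch; the real "tests-openvino" extra and its exact
# dependency list live in the repository's setup.py, which this diff does not show.
from setuptools import setup

setup(
    name="optimum-intel",
    extras_require={
        # Test-only dependencies specific to the OpenVINO suite, e.g. audio
        # decoding support needed to load "librispeech" calibration samples.
        "tests-openvino": ["soundfile", "librosa"],
    },
)
```

With an extra like this in place, `pip install .[tests,tests-openvino]` pulls in the audio-specific test dependencies only for the jobs that need them.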

docs/source/openvino/optimization.mdx

Lines changed: 19 additions & 0 deletions
````diff
@@ -166,6 +166,25 @@ calibration_dataset = quantizer.get_calibration_dataset(
 The `quantize()` method applies post-training static quantization and exports the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device.
 
 
+#### Speech-to-text Models Quantization
+
+The speech-to-text Whisper model can be quantized without the need for preparing a custom calibration dataset. Please see the example below.
+
+```python
+model_id = "openai/whisper-tiny"
+ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
+    model_id,
+    quantization_config=OVQuantizationConfig(
+        num_samples=10,
+        dataset="librispeech",
+        processor=model_id,
+        smooth_quant_alpha=0.95,
+    )
+)
+```
+
+With this, the encoder, decoder and decoder-with-past models of the Whisper pipeline will be fully quantized, including activations.
+
 ### Hybrid quantization
 
 Traditional optimization methods like post-training 8-bit quantization do not work well for Stable Diffusion (SD) models and can lead to poor generation results. On the other hand, weight compression does not improve performance significantly when applied to Stable Diffusion models, as the size of activations is comparable to weights.
````
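As a usage note on the documentation snippet above: the quantized pipeline behaves like any other OpenVINO model, so it can be saved and reloaded without re-running calibration. A minimal sketch using the standard `optimum.intel` entry points (the output directory name is illustrative):

```python
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

model_id = "openai/whisper-tiny"
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    quantization_config=OVQuantizationConfig(
        num_samples=10,
        dataset="librispeech",
        processor=model_id,
        smooth_quant_alpha=0.95,
    ),
)

# Persist the quantized encoder/decoder/decoder-with-past IR files (an XML plus
# a BIN file each), then reload them later without repeating calibration.
ov_model.save_pretrained("whisper-tiny-int8")  # illustrative path
ov_model = OVModelForSpeechSeq2Seq.from_pretrained("whisper-tiny-int8")
```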

optimum/intel/openvino/configuration.py

Lines changed: 72 additions & 7 deletions
```diff
@@ -26,7 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
-from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -255,6 +255,10 @@ def __init__(
         sym: bool = False,
         ignored_scope: Optional[dict] = None,
         num_samples: Optional[int] = None,
+        dataset: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional[str] = None,
+        processor: Optional[str] = None,
+        trust_remote_code: bool = False,
         **kwargs,
     ):
         """
@@ -272,6 +276,10 @@ def __init__(
         self.bits = bits
         self.sym = sym
         self.num_samples = num_samples
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.trust_remote_code = trust_remote_code
 
         if isinstance(ignored_scope, nncf.IgnoredScope):
             ignored_scope = ignored_scope.__dict__
@@ -313,6 +321,10 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
                 user or organization name, like `dbmdz/bert-base-german-cased`.
             - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                 using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+        trust_remote_code (`bool`, defaults to `False`):
+            Allows using custom code for the modeling hosted in the model repository. This option should only be set
+            for repositories you trust and in which you have read the code, as it will execute arbitrary code
+            present in the model repository on your local machine.
         dataset (`str or List[str]`, *optional*):
             The dataset used for data-aware compression with NNCF.
             - For language models you can provide your own dataset in a list of strings or just use one from the list
@@ -395,10 +407,16 @@ def __init__(
         backup_precision: Optional[str] = None,
         **kwargs,
     ):
-        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
-        self.tokenizer = tokenizer
-        self.trust_remote_code = trust_remote_code
-        self.dataset = dataset
+        super().__init__(
+            bits=bits,
+            sym=sym,
+            ignored_scope=ignored_scope,
+            num_samples=num_samples,
+            dataset=dataset,
+            tokenizer=tokenizer,
+            processor=processor,
+            trust_remote_code=trust_remote_code,
+        )
         self.group_size = group_size or (-1 if bits == 8 else 128)
         self.ratio = ratio
         self.all_layers = all_layers
@@ -407,7 +425,6 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
-        self.processor = processor
         self.lora_correction = lora_correction
         self.backup_precision = backup_precision
         self.post_init()
@@ -535,6 +552,11 @@ def __init__(
         model_type: str = "transformer",
         fast_bias_correction: bool = True,
         overflow_fix: str = "disable",
+        dataset: Optional[str] = None,
+        tokenizer: Optional[str] = None,
+        processor: Optional[str] = None,
+        trust_remote_code: bool = False,
+        smooth_quant_alpha: Optional[float] = None,
         **kwargs,
     ):
         """
@@ -557,11 +579,42 @@ def __init__(
                 Whether to apply fast or full bias correction algorithm.
             overflow_fix (`str`, defaults to "disable"):
                 Parameter for controlling overflow fix setting.
+            dataset (`str`, *optional*):
+                The dataset used for quantization. For speech-to-text model quantization the allowed value is 'librispeech'.
+            tokenizer (`str`, *optional*):
+                The tokenizer used to process the dataset. You can pass either:
+                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                    Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                    user or organization name, like `dbmdz/bert-base-german-cased`.
+                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                    using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+            processor (`str`, *optional*):
+                A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
+            trust_remote_code (`bool`, defaults to `False`):
+                Allows using custom code for the modeling hosted in the model repository. This option should only be set
+                for repositories you trust and in which you have read the code, as it will execute arbitrary code
+                present in the model repository on your local machine.
+            smooth_quant_alpha (`float`, *optional*):
+                SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
+                reduces quantization error.
         """
-        super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
+        super().__init__(
+            bits=bits,
+            sym=sym,
+            ignored_scope=ignored_scope,
+            num_samples=num_samples,
+            dataset=dataset,
+            tokenizer=tokenizer,
+            processor=processor,
+            trust_remote_code=trust_remote_code,
+        )
         self.model_type = model_type
         self.fast_bias_correction = fast_bias_correction
         self.overflow_fix = overflow_fix
+        self.smooth_quant_alpha = smooth_quant_alpha
         self.post_init()
 
     def post_init(self):
@@ -573,6 +626,18 @@ def post_init(self):
         if self.bits != 8:
             raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")
 
+        if self.dataset is not None:
+            if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
+                raise ValueError(
+                    f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
+                    f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
+                )
+
+        if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
+            raise ValueError(
+                f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
+            )
+
 
 class OVConfig(BaseConfig):
     CONFIG_NAME = "openvino_config.json"
```
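To make the new `post_init()` checks concrete, here is a short sketch of how the extended `OVQuantizationConfig` validates its inputs; it assumes `optimum.intel` exports the class and that NNCF is installed:

```python
from optimum.intel import OVQuantizationConfig

# Valid: "librispeech" is a predefined speech-to-text dataset and the
# SmoothQuant alpha lies inside the accepted [0, 1] range.
config = OVQuantizationConfig(
    num_samples=10,
    dataset="librispeech",
    processor="openai/whisper-tiny",
    smooth_quant_alpha=0.95,
)

# An unknown dataset name raises a ValueError listing the supported choices.
try:
    OVQuantizationConfig(dataset="common_voice")
except ValueError as err:
    print(err)

# An alpha outside [0, 1] raises from the smooth_quant_alpha range check.
try:
    OVQuantizationConfig(dataset="librispeech", smooth_quant_alpha=1.5)
except ValueError as err:
    print(err)
```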

optimum/intel/openvino/modeling_seq2seq.py

Lines changed: 20 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import copy
 import logging
 import os
 from pathlib import Path
@@ -35,7 +35,9 @@
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
 
+from .. import OVConfig, OVQuantizer
 from ..utils import is_transformers_version
+from .configuration import OVQuantizationConfig, OVQuantizationConfigBase
 from .modeling_base_seq2seq import OVBaseModelForSeq2SeqLM
 from .utils import OV_TO_PT_TYPE, _print_compiled_model_properties
 
@@ -973,9 +975,25 @@ def _from_pretrained(
         cls,
         model_id: Union[str, Path],
         config: "PretrainedConfig",
+        load_in_8bit: bool = False,
+        quantization_config: Union[dict, OVQuantizationConfigBase] = None,
         **kwargs,
     ):
-        return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)
+        compile_only = kwargs.get("compile_only", False)
+
+        if not compile_only and isinstance(quantization_config, OVQuantizationConfig):
+            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
+                model_id, config, load_in_8bit=False, **kwargs
+            )
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.processor = quantization_config.processor or model_id
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+        else:
+            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
+                model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
+            )
+
+        return model
 
     class DummyWhisperModel:
         def __init__(self):
```
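For context, the branch added above means a full `OVQuantizationConfig` (as opposed to weight-only compression via `load_in_8bit` or `OVWeightQuantizationConfig`) makes `from_pretrained` load the model unquantized first, deep-copy the config so the caller's object is not mutated, default `processor` to the model id, and then run `OVQuantizer` with the calibration dataset. A minimal sketch of triggering this path; `export=True` is an assumption here for converting the PyTorch checkpoint to OpenVINO IR before quantization:

```python
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

# Passing an OVQuantizationConfig (not a weight-only config) routes through the
# quantize-on-load branch: load unquantized, then calibrate and quantize in place.
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-tiny",
    export=True,  # convert the PyTorch checkpoint to OpenVINO IR first
    quantization_config=OVQuantizationConfig(num_samples=10, dataset="librispeech"),
)
```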
