|
49 | 49 | from optimum.exporters.utils import check_dummy_inputs_are_allowed |
50 | 50 | from optimum.intel.openvino.modeling_sam import OVSamPromptEncoder, OVSamVisionEncoder |
51 | 51 | from optimum.quantization_base import OptimumQuantizer |
| 52 | +from optimum.utils.logging import warn_once |
52 | 53 |
|
53 | 54 | from ...exporters.openvino import export, export_pytorch_via_onnx |
54 | 55 | from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer |
|
59 | 60 | _nncf_version, |
60 | 61 | is_datasets_available, |
61 | 62 | is_diffusers_available, |
| 63 | + is_nncf_version, |
62 | 64 | is_sentence_transformers_available, |
63 | 65 | ) |
64 | 66 | from ..utils.modeling_utils import get_model_device |
@@ -693,7 +695,6 @@ def _prepare_visual_causal_lm_calibration_data( |
693 | 695 | Prepares calibration data for VLM pipelines. |
694 | 696 | Currently, it collects data only for the language model component. |
695 | 697 | """ |
696 | | - |
697 | 698 | processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code) |
698 | 699 | try: |
699 | 700 | tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code) |
@@ -754,12 +755,51 @@ def _prepare_visual_causal_lm_calibration_data( |
754 | 755 | ) |
755 | 756 |
|
756 | 757 | collected_inputs["lm_model"].append(language_model_inputs) |
| 758 | + |
| 759 | + # If an input dict contains `pixel_values` key and its batch size is greater than 1, we split the data |
| 760 | + # into multiple single-batch dicts below. This lowers peak RAM consumption during quantization calibration. |
| 761 | + for submodel_name in collected_inputs: |
| 762 | + single_batch_collected_inputs = [] |
| 763 | + for input_dict in collected_inputs[submodel_name]: |
| 764 | + # We expect 'pixel_values' to be a 4D tensor: [batch, channel, height, width]. |
| 765 | + # This is standard for batches of images in vision models. |
| 766 | + if ( |
| 767 | + "pixel_values" in input_dict |
| 768 | + and isinstance(input_dict["pixel_values"], torch.Tensor) |
| 769 | + and input_dict["pixel_values"].dim() == 4 |
| 770 | + and input_dict["pixel_values"].shape[0] > 1 |
| 771 | + ): |
| 772 | + if is_nncf_version("<=", "2.18"): |
| 773 | + # TODO (Nikita): Remove once NNCF 2.19 is released. |
| 774 | + warn_once( |
| 775 | + logger, |
| 776 | + "If you are facing RAM OOM issues, please update to the latest NNCF develop version.", |
| 777 | + ) |
| 778 | + batch_size = input_dict["pixel_values"].shape[0] |
| 779 | + for i in range(batch_size): |
| 780 | + single_batch_input_dict = {} |
| 781 | + for input_name, input_value in input_dict.items(): |
| 782 | + if not isinstance(input_value, torch.Tensor): |
| 783 | + raise TypeError( |
| 784 | + f"Expected a torch.Tensor instance for input '{input_name}', " |
| 785 | + f"but got {type(input_value)}." |
| 786 | + ) |
| 787 | + if input_value.shape[0] != batch_size: |
| 788 | + raise ValueError( |
| 789 | + f"Expected a tensor with batch size {batch_size} for input '{input_name}', " |
| 790 | + f"but got shape {input_value.shape}." |
| 791 | + ) |
| 792 | + single_batch_input_dict[input_name] = input_value[i : i + 1] |
| 793 | + single_batch_collected_inputs.append(single_batch_input_dict) |
| 794 | + else: |
| 795 | + single_batch_collected_inputs.append(input_dict) |
| 796 | + collected_inputs[submodel_name] = single_batch_collected_inputs |
757 | 797 | finally: |
758 | 798 | for ov_component in vision_embedding_components: |
759 | 799 | ov_component.request = ov_component.request.request |
760 | 800 |
|
761 | | - for k in collected_inputs: |
762 | | - collected_inputs[k] = nncf.Dataset(collected_inputs[k]) |
| 801 | + for submodel_name in collected_inputs: |
| 802 | + collected_inputs[submodel_name] = nncf.Dataset(collected_inputs[submodel_name]) |
763 | 803 |
|
764 | 804 | return OVCalibrationDataset(collected_inputs) |
765 | 805 |
|
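The hunk above splits any calibration entry whose `pixel_values` tensor carries more than one sample into per-sample dictionaries, so NNCF never holds a full image batch in memory at once. A minimal standalone sketch of the same slicing logic, assuming every input tensor shares the leading batch dimension (names and shapes below are illustrative, not taken from the pipeline):

```python
import torch

# A toy calibration entry: a batch of 4 images plus a matching attention mask.
input_dict = {
    "pixel_values": torch.rand(4, 3, 224, 224),
    "attention_mask": torch.ones(4, 16, dtype=torch.long),
}

# Slice every tensor along dim 0, keeping the batch dimension as size 1.
batch_size = input_dict["pixel_values"].shape[0]
single_batch_dicts = [
    {name: value[i : i + 1] for name, value in input_dict.items()}
    for i in range(batch_size)
]

assert len(single_batch_dicts) == batch_size
assert single_batch_dicts[0]["pixel_values"].shape == (1, 3, 224, 224)
assert single_batch_dicts[0]["attention_mask"].shape == (1, 16)
```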
@@ -1298,7 +1338,9 @@ def _quantize_ovbasemodel( |
1298 | 1338 | **kwargs, |
1299 | 1339 | ): |
1300 | 1340 | quantization_config = ov_config.quantization_config |
| 1341 | + dataset_was_built_from_config = False |
1301 | 1342 | if calibration_dataset is None and quantization_config.dataset is not None: |
| 1343 | + dataset_was_built_from_config = True |
1302 | 1344 | calibration_dataset = self.dataset_builder.build_from_quantization_config(quantization_config) |
1303 | 1345 |
|
1304 | 1346 | quantization_configs = {} |
@@ -1353,13 +1395,7 @@ def _quantize_ovbasemodel( |
1353 | 1395 | # |
1354 | 1396 | # Full quantization |
1355 | 1397 | # |
1356 | | - if isinstance(self.model, _OVModelForWhisper): |
1357 | | - for submodel_name in self.model.ov_submodels: |
1358 | | - # quantization_config.num_samples of audio samples result in more actual model inputs |
1359 | | - config = quantization_config.clone() |
1360 | | - config.num_samples = calibration_dataset[submodel_name].get_length() |
1361 | | - quantization_configs[submodel_name] = config |
1362 | | - elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): |
| 1398 | + if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline): |
1363 | 1399 | diffusion_model_name = next(iter(calibration_dataset)) |
1364 | 1400 | quantization_configs[diffusion_model_name] = quantization_config |
1365 | 1401 | default_config = OVWeightQuantizationConfig(bits=8) |
@@ -1404,6 +1440,11 @@ def _quantize_ovbasemodel( |
1404 | 1440 | if isinstance(config, OVWeightQuantizationConfig) and config.quant_method == OVQuantizationMethod.HYBRID: |
1405 | 1441 | config = _get_hybrid_mixed_quantization_config(submodel, config, **kwargs) |
1406 | 1442 |
|
| 1443 | + if dataset_was_built_from_config and nncf_dataset is not None and nncf_dataset.get_length() is not None: |
| 1444 | + # For datasets built from the quantization config, override num_samples with the actual per-submodel dataset length |
| 1445 | + config = config.clone() |
| 1446 | + config.num_samples = nncf_dataset.get_length() |
| 1447 | + |
1407 | 1448 | if isinstance(config, OVWeightQuantizationConfig): |
1408 | 1449 | if config.bits == 8: |
1409 | 1450 | # 8-bit weight only data-aware quantization is not supported |
|
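The final hunk replaces the Whisper-specific `num_samples` adjustment with a generic one: whenever the calibration dataset was built from the quantization config and its length is known, each submodel's config is cloned and `num_samples` is set to the actual dataset length. A minimal sketch of that clone-and-override pattern, using a list-backed `nncf.Dataset` (whose length is known) and a hypothetical stand-in for the config object:

```python
import copy
from dataclasses import dataclass
from typing import Optional

import nncf


@dataclass
class DummyQuantizationConfig:
    # Hypothetical stand-in for the real quantization config class;
    # clone() mirrors the clone-and-override pattern in the diff.
    num_samples: Optional[int] = None

    def clone(self):
        return copy.deepcopy(self)


# A list-backed data source: nncf.Dataset can report its length.
calibration_items = [{"input_ids": [i]} for i in range(10)]
nncf_dataset = nncf.Dataset(calibration_items)

config = DummyQuantizationConfig()
length = nncf_dataset.get_length()  # 10 here; None when the source has no __len__
if length is not None:
    config = config.clone()
    config.num_samples = length

print(config.num_samples)  # 10
```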