
Commit bc13ae5

[OV] Fix high memory consumption during vision encoder quantization (#1440)
* WIP
* Clean-up
* Set 128 samples by default
* Bring batching logic
* Update num_samples dynamically
* Remove temporary code
* Undo vision_embedding_crop_size
* Rename variable
* Apply changes suggested by Copilot
* Update pipeline test
* Add nncf installation warning
* Apply another Copilot suggestion
* Apply copilot suggestions
* Apply suggested changes
1 parent 939918f commit bc13ae5

File tree

2 files changed: +64, -11 lines


optimum/intel/openvino/quantization.py

Lines changed: 51 additions & 10 deletions
@@ -49,6 +49,7 @@
 from optimum.exporters.utils import check_dummy_inputs_are_allowed
 from optimum.intel.openvino.modeling_sam import OVSamPromptEncoder, OVSamVisionEncoder
 from optimum.quantization_base import OptimumQuantizer
+from optimum.utils.logging import warn_once

 from ...exporters.openvino import export, export_pytorch_via_onnx
 from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer
@@ -59,6 +60,7 @@
     _nncf_version,
     is_datasets_available,
     is_diffusers_available,
+    is_nncf_version,
     is_sentence_transformers_available,
 )
 from ..utils.modeling_utils import get_model_device
@@ -693,7 +695,6 @@ def _prepare_visual_causal_lm_calibration_data(
         Prepares calibration data for VLM pipelines.
         Currently, collects data only for a language model component.
         """
-
         processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
         try:
             tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
@@ -754,12 +755,51 @@ def _prepare_visual_causal_lm_calibration_data(
                 )

                 collected_inputs["lm_model"].append(language_model_inputs)
+
+            # If an input dict contains `pixel_values` key and its batch size is greater than 1, we split the data
+            # into multiple single-batch dicts below. This lowers peak RAM consumption during quantization calibration.
+            for submodel_name in collected_inputs:
+                single_batch_collected_inputs = []
+                for input_dict in collected_inputs[submodel_name]:
+                    # We expect 'pixel_values' to be a 4D tensor: [batch, channel, height, width].
+                    # This is standard for batches of images in vision models.
+                    if (
+                        "pixel_values" in input_dict
+                        and isinstance(input_dict["pixel_values"], torch.Tensor)
+                        and input_dict["pixel_values"].dim() == 4
+                        and input_dict["pixel_values"].shape[0] > 1
+                    ):
+                        if is_nncf_version("<=", "2.18"):
+                            # TODO (Nikita): Remove once NNCF 2.19 is released.
+                            warn_once(
+                                logger,
+                                "If you are facing RAM OOM issues, please update to the latest NNCF develop version.",
+                            )
+                        batch_size = input_dict["pixel_values"].shape[0]
+                        for i in range(batch_size):
+                            single_batch_input_dict = {}
+                            for input_name, input_value in input_dict.items():
+                                if not isinstance(input_value, torch.Tensor):
+                                    raise TypeError(
+                                        f"Expected a torch.Tensor instance for input '{input_name}', "
+                                        f"but got {type(input_value)}."
+                                    )
+                                if input_value.shape[0] != batch_size:
+                                    raise ValueError(
+                                        f"Expected a tensor with batch size {batch_size} for input '{input_name}', "
+                                        f"but got shape {input_value.shape}."
+                                    )
+                                single_batch_input_dict[input_name] = input_value[i : i + 1]
+                            single_batch_collected_inputs.append(single_batch_input_dict)
+                    else:
+                        single_batch_collected_inputs.append(input_dict)
+                collected_inputs[submodel_name] = single_batch_collected_inputs
         finally:
             for ov_component in vision_embedding_components:
                 ov_component.request = ov_component.request.request

-        for k in collected_inputs:
-            collected_inputs[k] = nncf.Dataset(collected_inputs[k])
+        for submodel_name in collected_inputs:
+            collected_inputs[submodel_name] = nncf.Dataset(collected_inputs[submodel_name])

         return OVCalibrationDataset(collected_inputs)
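For context, here is a minimal, self-contained sketch of the batch-splitting idea from the hunk above. The helper name split_into_single_batches and the example shapes are illustrative, not part of optimum-intel:

# Minimal sketch of the batch-splitting logic above; split_into_single_batches
# is a hypothetical helper, not the committed optimum-intel code.
import torch


def split_into_single_batches(input_dict):
    """Split a dict of batched tensors into a list of single-sample dicts.

    Keeping one sample per calibration entry means the quantization backend
    never holds a whole multi-image batch in memory at once, lowering peak RAM.
    """
    pixel_values = input_dict.get("pixel_values")
    if not (
        isinstance(pixel_values, torch.Tensor)
        and pixel_values.dim() == 4
        and pixel_values.shape[0] > 1
    ):
        return [input_dict]  # nothing to split

    batch_size = pixel_values.shape[0]
    # Every tensor in the dict must share the same leading batch dimension.
    return [
        {name: value[i : i + 1] for name, value in input_dict.items()}
        for i in range(batch_size)
    ]


# A batch of 4 RGB images becomes 4 single-image calibration entries.
batch = {
    "pixel_values": torch.rand(4, 3, 336, 336),
    "attention_mask": torch.ones(4, 32, dtype=torch.long),
}
entries = split_into_single_batches(batch)
assert len(entries) == 4
assert entries[0]["pixel_values"].shape[0] == 1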

@@ -1298,7 +1338,9 @@ def _quantize_ovbasemodel(
         **kwargs,
     ):
         quantization_config = ov_config.quantization_config
+        dataset_was_built_from_config = False
         if calibration_dataset is None and quantization_config.dataset is not None:
+            dataset_was_built_from_config = True
             calibration_dataset = self.dataset_builder.build_from_quantization_config(quantization_config)

         quantization_configs = {}
@@ -1353,13 +1395,7 @@ def _quantize_ovbasemodel(
             #
             # Full quantization
             #
-            if isinstance(self.model, _OVModelForWhisper):
-                for submodel_name in self.model.ov_submodels:
-                    # quantization_config.num_samples of audio samples result in more actual model inputs
-                    config = quantization_config.clone()
-                    config.num_samples = calibration_dataset[submodel_name].get_length()
-                    quantization_configs[submodel_name] = config
-            elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
+            if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                 diffusion_model_name = next(iter(calibration_dataset))
                 quantization_configs[diffusion_model_name] = quantization_config
                 default_config = OVWeightQuantizationConfig(bits=8)
@@ -1404,6 +1440,11 @@ def _quantize_ovbasemodel(
             if isinstance(config, OVWeightQuantizationConfig) and config.quant_method == OVQuantizationMethod.HYBRID:
                 config = _get_hybrid_mixed_quantization_config(submodel, config, **kwargs)

+            if dataset_was_built_from_config and nncf_dataset is not None and nncf_dataset.get_length() is not None:
+                # For datasets built from the quantization config, override num_samples per submodel
+                config = config.clone()
+                config.num_samples = nncf_dataset.get_length()
+
             if isinstance(config, OVWeightQuantizationConfig):
                 if config.bits == 8:
                     # 8-bit weight only data-aware quantization is not supported
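The override above generalizes the Whisper-specific branch that was removed earlier: when the calibration dataset is built from the quantization config, one requested sample (an audio clip, an image-text pair) can expand into several actual model inputs, so each submodel's config is aligned with the real dataset length. A rough sketch of that logic under hypothetical names (ToyConfig, override_num_samples — not optimum-intel APIs):

# Rough sketch of the per-submodel num_samples override; ToyConfig and
# override_num_samples are illustrative stand-ins, not optimum-intel APIs.
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyConfig:
    num_samples: Optional[int] = None

    def clone(self):
        return deepcopy(self)


def override_num_samples(config, dataset_length, built_from_config):
    # One requested calibration sample can expand into several model inputs
    # (e.g. after the batch splitting above), so the per-submodel config is
    # set to the number of inputs that were actually collected.
    if built_from_config and dataset_length is not None:
        config = config.clone()
        config.num_samples = dataset_length
    return config


requested = ToyConfig(num_samples=128)
per_submodel = override_num_samples(requested, dataset_length=512, built_from_config=True)
assert per_submodel.num_samples == 512 and requested.num_samples == 128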

tests/openvino/test_quantization.py

Lines changed: 13 additions & 1 deletion
@@ -1856,6 +1856,9 @@ def eval_expression_if_possible(expression):
                 q_rt_info = rt_info["nncf"][rt_info_key]
                 config_dict = sub_config.to_nncf_dict()
                 for param_name in q_rt_info:
+                    if sub_config.num_samples is None and param_name == "subset_size":
+                        # Skip subset_size check because num_samples was not explicitly provided
+                        continue
                     rt_info_value = q_rt_info[param_name]
                     if isinstance(rt_info_value, dict):
                         # For example, ignored scope case
@@ -1891,7 +1894,16 @@ def eval_expression_if_possible(expression):

                     if config_value is None and rt_info_value is False:
                         continue
-                    self.assertEqual(config_value, rt_info_value, f"Mismatch in {param_name} for {submodel_name}")
+                    if param_name == "subset_size":
+                        self.assertGreaterEqual(
+                            rt_info_value,
+                            config_value,
+                            f"Actual subset size should not be less than the requested one.",
+                        )
+                    else:
+                        self.assertEqual(
+                            config_value, rt_info_value, f"Mismatch in {param_name} for {submodel_name}"
+                        )


 class OVQuantizerQATest(unittest.TestCase):
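The relaxed assertion reflects the same expansion effect: the subset_size recorded in the model's rt_info may legitimately exceed the num_samples requested in the config, so the test checks only a lower bound. A toy illustration with made-up numbers:

# Toy illustration of the relaxed subset_size check; the numbers are made up.
import unittest


class SubsetSizeCheck(unittest.TestCase):
    def test_recorded_subset_size_can_exceed_requested(self):
        requested_num_samples = 32  # num_samples from the quantization config
        recorded_subset_size = 50   # subset_size stored in the model's rt_info
        # Each requested sample may expand into several model inputs, so the
        # recorded value may be larger than requested, but never smaller.
        self.assertGreaterEqual(recorded_subset_size, requested_num_samples)


if __name__ == "__main__":
    unittest.main()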
