Skip to content

Commit db52d1f

Browse files
VLM Vision Encoder full quantization (#1394)
* VE quantization experiments
* WIP
* Add a script for evaluation on MME
* Changes to mme_eval.py
* Added automatic quantization of VLM
* Style
* Added test
* Remove helper files
* Fix
* Apply quantization of VE if general quantization config is selected
* Fix
* Fix in other places
* Another fix
* Add CLI test
* Add condition on transformers version
* Make method private
* Add default_config argument
1 parent 7594c3c commit db52d1f

File tree

10 files changed

+169
-84
lines changed

10 files changed

+169
-84
lines changed

optimum/intel/openvino/configuration.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,7 @@ class OVPipelineQuantizationConfig(OVQuantizationConfigBase):
12771277
def __init__(
12781278
self,
12791279
quantization_configs: Dict[str, Union[Dict, OVQuantizationConfigBase]],
1280+
default_config: Optional[Union[Dict, OVQuantizationConfigBase]] = None,
12801281
num_samples: Optional[int] = None,
12811282
dataset: Optional[Union[str, List[str]]] = None,
12821283
tokenizer: Optional[str] = None,
@@ -1293,6 +1294,9 @@ def __init__(
12931294
quantization_configs (Dict[str, Union[Dict, OVQuantizationConfigBase]]):
12941295
A dictionary where keys are submodel names and values are either dictionaries or instances of
12951296
`OVQuantizationConfigBase` containing quantization configurations for each submodel in the pipeline.
1297+
default_config (Optional[Union[Dict, OVQuantizationConfigBase]]):
1298+
A default quantization configuration that will be applied to all submodels that do not have a
1299+
specific configuration provided in `quantization_configs`.
12961300
num_samples (Optional[int]):
12971301
The maximum number of samples composing the calibration dataset. Defaults to None.
12981302
dataset (Optional[Union[str, List[str]]]):
@@ -1323,6 +1327,8 @@ def or_op(a, b):
13231327
for submodel_name, submodel_config in quantization_configs.items():
13241328
if isinstance(submodel_config, dict):
13251329
quantization_configs[submodel_name] = _quantization_config_from_dict(submodel_config)
1330+
if default_config is not None and isinstance(default_config, dict):
1331+
default_config = _quantization_config_from_dict(default_config)
13261332

13271333
# Pull dataset-related parameters from child configs
13281334
configs = quantization_configs.values()
@@ -1342,6 +1348,7 @@ def or_op(a, b):
13421348
**kwargs,
13431349
)
13441350
self.quantization_configs = quantization_configs
1351+
self.default_config = default_config
13451352
self.post_init()
13461353

13471354
def to_dict(self) -> Dict[str, Any]:

optimum/intel/openvino/modeling_base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,8 @@ def _from_pretrained(
429429

430430
quantizer = OVQuantizer(model)
431431
quantization_config_copy = copy.deepcopy(quantization_config)
432-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
433-
quantization_config_copy.processor = quantization_config.processor or model_id
432+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
433+
quantization_config_copy.processor = str(quantization_config.processor or model_id)
434434
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
435435

436436
return model

optimum/intel/openvino/modeling_decoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,7 @@ def _from_pretrained(
937937

938938
quantizer = OVQuantizer(causal_model)
939939
quantization_config_copy = copy.deepcopy(quantization_config)
940-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
940+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
941941
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
942942

943943
return causal_model

optimum/intel/openvino/modeling_sam.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,8 @@ def _from_pretrained(
299299

300300
quantizer = OVQuantizer(model)
301301
quantization_config_copy = quantization_config.clone()
302-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
303-
quantization_config_copy.processor = quantization_config.processor or model_id
302+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
303+
quantization_config_copy.processor = str(quantization_config.processor or model_id)
304304
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
305305

306306
return model

optimum/intel/openvino/modeling_seq2seq.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -569,8 +569,8 @@ def _from_pretrained(
569569

570570
quantizer = OVQuantizer(model)
571571
quantization_config_copy = quantization_config.clone()
572-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
573-
quantization_config_copy.processor = quantization_config.processor or model_id
572+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
573+
quantization_config_copy.processor = str(quantization_config.processor or model_id)
574574
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
575575

576576
return model

optimum/intel/openvino/modeling_text2speech.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ def _from_pretrained(
423423
from optimum.intel.openvino.quantization import OVQuantizer
424424

425425
quantization_config_copy = copy.deepcopy(quantization_config)
426-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
426+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
427427
OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
428428

429429
return model

optimum/intel/openvino/modeling_visual_language.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -622,9 +622,9 @@ def _from_pretrained(
622622
from optimum.intel.openvino.quantization import OVQuantizer
623623

624624
quantization_config_copy = copy.deepcopy(quantization_config)
625-
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
625+
quantization_config_copy.tokenizer = str(quantization_config.tokenizer or model_id)
626626
potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
627-
quantization_config_copy.processor = quantization_config.processor or potential_processor_id
627+
quantization_config_copy.processor = str(quantization_config.processor or potential_processor_id)
628628
OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
629629

630630
return model

optimum/intel/openvino/quantization.py

Lines changed: 93 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,8 @@ def _prepare_visual_causal_lm_calibration_data(
695695
Prepares calibration data for VLM pipelines.
696696
Currently, collects data only for a language model component.
697697
"""
698+
from optimum.intel.openvino.modeling_visual_language import OVVisionEmbedding
699+
698700
processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
699701
try:
700702
tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
@@ -704,43 +706,65 @@ def _prepare_visual_causal_lm_calibration_data(
704706

705707
dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[config.dataset]
706708

707-
calibration_data = []
708-
num_samples = config.num_samples or 32
709-
for item in tqdm(dataset, desc="Collecting calibration dataset", total=num_samples):
710-
if len(calibration_data) > num_samples:
711-
break
712-
713-
instruction = item[dataset_metadata["inputs"]["instruction"]]
714-
image_url = item[dataset_metadata["inputs"]["image_url"]]
715-
image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
716-
if max_image_size is not None:
717-
# To avoid large images, resize them keeping the aspect ratio
718-
scale_factor = max(image.size[0] / max_image_size, image.size[1] / max_image_size)
719-
if scale_factor > 1:
720-
new_size = (int(image.size[0] / scale_factor), int(image.size[1] / scale_factor))
721-
image = image.resize(new_size)
722-
723-
try:
724-
inputs = self.model.preprocess_inputs(
725-
text=instruction, image=image, processor=processor, tokenizer=tokenizer, config=self.model.config
709+
collected_inputs: Dict[str, List[Dict[str, Any]]] = {"lm_model": []}
710+
# Collect vision embeddings calibration data by using InferRequestWrapper
711+
vision_embedding_components = []
712+
for ov_component_name, ov_component in self.model.components.items():
713+
if not isinstance(ov_component, OVVisionEmbedding):
714+
continue
715+
vision_embedding_components.append(ov_component)
716+
submodel_name = f"{ov_component_name}_model"
717+
collected_inputs[submodel_name] = []
718+
ov_component._compile()
719+
ov_component.request = InferRequestWrapper(ov_component.request, collected_inputs[submodel_name])
720+
721+
try:
722+
num_samples = config.num_samples or 32
723+
for item in tqdm(dataset, desc="Collecting calibration dataset", total=num_samples):
724+
if len(collected_inputs["lm_model"]) >= num_samples:
725+
break
726+
727+
instruction = item[dataset_metadata["inputs"]["instruction"]]
728+
image_url = item[dataset_metadata["inputs"]["image_url"]]
729+
image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
730+
if max_image_size is not None:
731+
# To avoid large images, resize them keeping the aspect ratio
732+
scale_factor = max(image.size[0] / max_image_size, image.size[1] / max_image_size)
733+
if scale_factor > 1:
734+
new_size = (int(image.size[0] / scale_factor), int(image.size[1] / scale_factor))
735+
image = image.resize(new_size)
736+
737+
try:
738+
inputs = self.model.preprocess_inputs(
739+
text=instruction,
740+
image=image,
741+
processor=processor,
742+
tokenizer=tokenizer,
743+
config=self.model.config,
744+
)
745+
except ValueError as value_error:
746+
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
747+
raise tokenizer_error
748+
raise value_error
749+
750+
inputs_embeds, attention_mask, position_ids = self.model.get_multimodal_embeddings(**inputs)
751+
752+
language_model_inputs = self.model.language_model.prepare_inputs(
753+
input_ids=None,
754+
attention_mask=attention_mask,
755+
position_ids=position_ids,
756+
inputs_embeds=inputs_embeds,
726757
)
727-
except ValueError as value_error:
728-
if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
729-
raise tokenizer_error
730-
raise value_error
731-
732-
inputs_embeds, attention_mask, position_ids = self.model.get_multimodal_embeddings(**inputs)
733-
734-
language_model_inputs = self.model.language_model.prepare_inputs(
735-
input_ids=None,
736-
attention_mask=attention_mask,
737-
position_ids=position_ids,
738-
inputs_embeds=inputs_embeds,
739-
)
740758

741-
calibration_data.append(language_model_inputs)
759+
collected_inputs["lm_model"].append(language_model_inputs)
760+
finally:
761+
for ov_component in vision_embedding_components:
762+
ov_component.request = ov_component.request.request
763+
764+
for k in collected_inputs:
765+
collected_inputs[k] = nncf.Dataset(collected_inputs[k])
742766

743-
return OVCalibrationDataset({"lm_model": nncf.Dataset(calibration_data)})
767+
return OVCalibrationDataset(collected_inputs)
744768

745769
def _prepare_speech_to_text_calibration_data(
746770
self, config: OVQuantizationConfigBase, dataset: "Dataset"
@@ -1285,7 +1309,7 @@ def _quantize_ovbasemodel(
12851309
**kwargs,
12861310
):
12871311
from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper
1288-
from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM
1312+
from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM, OVVisionEmbedding
12891313

12901314
if is_diffusers_available():
12911315
from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline
@@ -1295,25 +1319,19 @@ def _quantize_ovbasemodel(
12951319
calibration_dataset = self.dataset_builder.build_from_quantization_config(quantization_config)
12961320

12971321
quantization_configs = {}
1298-
if isinstance(quantization_config, OVPipelineQuantizationConfig):
1299-
quantization_configs = quantization_config.quantization_configs
1300-
elif (
1322+
default_config = None
1323+
if (
13011324
isinstance(quantization_config, OVWeightQuantizationConfig)
13021325
and quantization_config.quant_method != OVQuantizationMethod.HYBRID
13031326
):
13041327
#
13051328
# Regular (non-hybrid) weight-only quantization
13061329
#
13071330
if isinstance(self.model, OVModelForVisualCausalLM):
1308-
for submodel_name in self.model.ov_submodels:
1309-
quantization_configs[submodel_name] = (
1310-
quantization_config
1311-
if submodel_name == "lm_model"
1312-
else OVWeightQuantizationConfig(bits=8, sym=True)
1313-
)
1331+
quantization_configs["lm_model"] = quantization_config
1332+
default_config = OVWeightQuantizationConfig(bits=8, sym=True)
13141333
else:
1315-
for submodel_name in self.model.ov_submodels:
1316-
quantization_configs[submodel_name] = quantization_config
1334+
default_config = quantization_config
13171335
else:
13181336
#
13191337
# Hybrid/Full/Mixed quantization
@@ -1344,9 +1362,7 @@ def _quantize_ovbasemodel(
13441362
quantization_config_copy = quantization_config.clone()
13451363
quantization_config_copy.dataset = None
13461364
quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT
1347-
for submodel_name in self.model.ov_submodels:
1348-
if submodel_name != diffusion_model_name:
1349-
quantization_configs[submodel_name] = quantization_config_copy
1365+
default_config = quantization_config_copy
13501366
else:
13511367
# The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc.
13521368
quantization_configs["model"] = quantization_config
@@ -1363,44 +1379,52 @@ def _quantize_ovbasemodel(
13631379
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
13641380
diffusion_model_name = next(iter(calibration_dataset))
13651381
quantization_configs[diffusion_model_name] = quantization_config
1366-
for submodel_name in self.model.ov_submodels:
1367-
if submodel_name != diffusion_model_name:
1368-
quantization_configs[submodel_name] = OVWeightQuantizationConfig(bits=8)
1382+
default_config = OVWeightQuantizationConfig(bits=8)
13691383
elif isinstance(self.model, OVModelForVisualCausalLM):
1370-
for submodel_name in self.model.ov_submodels:
1371-
quantization_configs[submodel_name] = (
1372-
quantization_config
1373-
if submodel_name == "lm_model"
1374-
else OVWeightQuantizationConfig(bits=8, sym=True)
1375-
)
1376-
else:
1377-
for submodel_name in self.model.ov_submodels:
1384+
quantization_configs["lm_model"] = quantization_config
1385+
vision_embedding_submodel_names = [
1386+
f"{name}_model"
1387+
for name, component in self.model.components.items()
1388+
if isinstance(component, OVVisionEmbedding)
1389+
]
1390+
for submodel_name in vision_embedding_submodel_names:
13781391
quantization_configs[submodel_name] = quantization_config
1392+
default_config = OVWeightQuantizationConfig(bits=8, sym=True)
1393+
else:
1394+
default_config = quantization_config
13791395
elif isinstance(quantization_config, OVMixedQuantizationConfig):
13801396
#
13811397
# Mixed quantization
13821398
#
13831399
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
13841400
raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")
13851401

1386-
for submodel_name in self.model.ov_submodels:
1387-
quantization_configs[submodel_name] = quantization_config
1388-
else:
1402+
default_config = quantization_config
1403+
elif not isinstance(quantization_config, OVPipelineQuantizationConfig):
13891404
raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
13901405

1391-
for submodel_name, config in quantization_configs.items():
1392-
if submodel_name not in self.model.ov_submodels:
1393-
raise RuntimeError(
1394-
f"Unexpected submodel name encountered during applying quantization: {submodel_name}. "
1395-
f"Available submodels: {list(self.model.ov_submodels.keys())}."
1396-
)
1406+
pipeline_quantization_config = (
1407+
quantization_config
1408+
if isinstance(quantization_config, OVPipelineQuantizationConfig)
1409+
else OVPipelineQuantizationConfig(quantization_configs, default_config=default_config)
1410+
)
1411+
1412+
for submodel_name in self.model.ov_submodels:
1413+
config = pipeline_quantization_config.quantization_configs.get(
1414+
submodel_name, pipeline_quantization_config.default_config
1415+
)
1416+
if config is None:
1417+
continue
13971418
submodel = self.model.ov_submodels[submodel_name]
13981419
nncf_dataset = calibration_dataset.get(submodel_name, None) if calibration_dataset else None
13991420

14001421
if isinstance(config, OVWeightQuantizationConfig) and config.quant_method == OVQuantizationMethod.HYBRID:
14011422
config = _get_hybrid_mixed_quantization_config(submodel, config, **kwargs)
14021423

14031424
if isinstance(config, OVWeightQuantizationConfig):
1425+
if config.bits == 8:
1426+
# 8-bit weight only data-aware quantization is not supported
1427+
nncf_dataset = None
14041428
# Weight only quantization is performed in-place
14051429
_weight_only_quantization(submodel, config, nncf_dataset, **kwargs)
14061430
elif isinstance(config, (OVQuantizationConfig, OVMixedQuantizationConfig)):

tests/openvino/test_exporters_cli.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,28 @@ class OVCLIExportTestCase(unittest.TestCase):
469469
),
470470
]
471471

472+
if is_transformers_version(">=", "4.45.0"):
473+
SUPPORTED_QUANTIZATION_ARCHITECTURES.extend(
474+
[
475+
(
476+
"image-text-to-text",
477+
"internvl2",
478+
"f8e4m3",
479+
"--dataset contextual --num-samples 1 --trust-remote-code",
480+
{
481+
"lm_model": 15,
482+
"text_embeddings_model": 0,
483+
"vision_embeddings_model": 17,
484+
},
485+
{
486+
"lm_model": {"f8e4m3": 15},
487+
"text_embeddings_model": {"int8": 1},
488+
"vision_embeddings_model": {"f8e4m3": 11},
489+
},
490+
),
491+
]
492+
)
493+
472494
TEST_4BIT_CONFIGURATIONS = [
473495
(
474496
"text-generation-with-past",
@@ -1082,7 +1104,7 @@ def test_exporters_cli_full_quantization(
10821104
if "--library sentence_transformers" in option
10831105
else eval(_HEAD_TO_AUTOMODELS[task])
10841106
)
1085-
model = model_cls.from_pretrained(tmpdir)
1107+
model = model_cls.from_pretrained(tmpdir, trust_remote_code="--trust-remote-code" in option)
10861108

10871109
if (
10881110
"automatic-speech-recognition" in task or "text2text-generation" in task

0 commit comments

Comments (0)