Commit b7819a9

Add PTQ support for OVModelForZeroShotImageClassification (#1283)
* Fix custom dataset processing for text encoding tasks
* Update tests
* PTQ support for zero-shot image classification task
* Update to take into account custom datasets
* Add docs
* Rely on base from_pretrained method
1 parent 560b980 commit b7819a9

File tree: 8 files changed, +300 −48 lines

docs/source/openvino/optimization.mdx

Lines changed: 58 additions & 0 deletions

@@ -383,6 +383,64 @@ Click on a ✅ to copy the command/code for the corresponding optimization case.
         </button>
       </td>
     </tr>
+    <tr>
+      <td style="text-align: center; vertical-align: middle;">zero-shot-image-classification<br>(OVModelForZeroShotImageClassification)</td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('optimum-cli export openvino -m openai/clip-vit-base-patch16 --weight-format int8 ./save_dir')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('OVModelForZeroShotImageClassification.from_pretrained(\'openai/clip-vit-base-patch16\', quantization_config=OVWeightQuantizationConfig(bits=8)).save_pretrained(\'save_dir\')')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('optimum-cli export openvino -m openai/clip-vit-base-patch16 --weight-format int4 --dataset conceptual_captions ./save_dir')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('OVModelForZeroShotImageClassification.from_pretrained(\'openai/clip-vit-base-patch16\', quantization_config=OVWeightQuantizationConfig(bits=4, dataset=\'conceptual_captions\')).save_pretrained(\'save_dir\')')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">–</td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('OVModelForZeroShotImageClassification.from_pretrained(\'openai/clip-vit-base-patch16\', quantization_config=OVWeightQuantizationConfig(bits=4, quant_method=\'hybrid\', dataset=\'conceptual_captions\')).save_pretrained(\'save_dir\')')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('optimum-cli export openvino -m openai/clip-vit-base-patch16 --quant-mode int8 --dataset conceptual_captions ./save_dir')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('OVModelForZeroShotImageClassification.from_pretrained(\'openai/clip-vit-base-patch16\', quantization_config=OVQuantizationConfig(bits=8, dataset=\'conceptual_captions\')).save_pretrained(\'save_dir\')')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('optimum-cli export openvino -m openai/clip-vit-base-patch16 --quant-mode nf4_f8e4m3 --dataset conceptual_captions ./save_dir')">
+          ✅
+        </button>
+      </td>
+      <td style="text-align: center; vertical-align: middle;">
+        <button
+          onclick="navigator.clipboard.writeText('OVModelForZeroShotImageClassification.from_pretrained(\'openai/clip-vit-base-patch16\', quantization_config=OVMixedQuantizationConfig(OVWeightQuantizationConfig(bits=4, dtype=\'nf4\'), OVQuantizationConfig(dtype=\'f8e4m3\', dataset=\'conceptual_captions\'))).save_pretrained(\'save_dir\')')">
+          ✅
+        </button>
+      </td>
+    </tr>
     </tbody>
 </table>
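In plain form, the Python one-liners behind the ✅ buttons in the new row unroll to the following (a minimal sketch; model id and quantization settings are exactly those documented in the table above):

```python
from optimum.intel import OVModelForZeroShotImageClassification, OVWeightQuantizationConfig

# 8-bit weight-only quantization (data-free), as in the new docs row.
model = OVModelForZeroShotImageClassification.from_pretrained(
    "openai/clip-vit-base-patch16",
    quantization_config=OVWeightQuantizationConfig(bits=8),
)
model.save_pretrained("save_dir")

# Data-aware 4-bit weight quantization, using the predefined dataset added by this PR.
model = OVModelForZeroShotImageClassification.from_pretrained(
    "openai/clip-vit-base-patch16",
    quantization_config=OVWeightQuantizationConfig(bits=4, dataset="conceptual_captions"),
)
model.save_pretrained("save_dir")
```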

optimum/commands/export/openvino.py

Lines changed: 5 additions & 1 deletion

@@ -454,7 +454,7 @@ def run(self):
        elif (
            quantize_with_dataset
            and (
-                task == "fill-mask"
+                task in ["fill-mask", "zero-shot-image-classification"]
                or task.startswith("text-generation")
                or task.startswith("automatic-speech-recognition")
                or task.startswith("feature-extraction")
@@ -485,6 +485,10 @@ def run(self):
                from ...intel import OVModelForMaskedLM

                model_cls = OVModelForMaskedLM
+            elif task == "zero-shot-image-classification":
+                from ...intel import OVModelForZeroShotImageClassification
+
+                model_cls = OVModelForZeroShotImageClassification
            else:
                raise NotImplementedError(
                    f"Unable to find a matching model class for the task={task} and library_name={library_name}."

optimum/intel/openvino/modeling.py

Lines changed: 0 additions & 1 deletion

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
 import os
 from pathlib import Path

optimum/intel/openvino/modeling_base.py

Lines changed: 1 addition & 0 deletions

@@ -424,6 +424,7 @@ def _from_pretrained(
            quantizer = OVQuantizer(model)
            quantization_config_copy = copy.deepcopy(quantization_config)
            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            quantization_config_copy.processor = quantization_config.processor or model_id
            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))

        return model
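This mirrors the `tokenizer` fallback one line up: when quantization is requested through `from_pretrained` and the config does not name a processor, the model id is used. A minimal sketch of what that enables (the call shape matches the docs table; nothing here is new API):

```python
from optimum.intel import OVModelForZeroShotImageClassification, OVQuantizationConfig

# No explicit `processor`: thanks to the fallback above it defaults to the
# model id, which is what data-aware static quantization needs.
model = OVModelForZeroShotImageClassification.from_pretrained(
    "openai/clip-vit-base-patch16",
    quantization_config=OVQuantizationConfig(bits=8, dataset="conceptual_captions"),
)
model.save_pretrained("save_dir")
```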

optimum/intel/openvino/quantization.py

Lines changed: 95 additions & 5 deletions

@@ -18,6 +18,7 @@
 import os
 from collections import UserDict, deque
 from contextlib import contextmanager
+from io import BytesIO
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

@@ -73,6 +74,7 @@
     PREDEFINED_LANGUAGE_DATASETS,
     PREDEFINED_SD_DATASETS,
     PREDEFINED_SPEECH_TO_TEXT_DATASETS,
+    PREDEFINED_TEXT_IMAGE_ENCODER_DATASETS,
     PREDEFINED_VISUAL_LM_DATASETS,
 )

@@ -268,6 +270,7 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
             OVModelForFeatureExtraction,
             OVModelForMaskedLM,
             OVModelForVisualCausalLM,
+            OVModelForZeroShotImageClassification,
             OVSentenceTransformer,
         )
         from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper
@@ -280,7 +283,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV

         if isinstance(self.model, OVModelForCausalLM):
             return self._prepare_causal_lm_calibration_data(config)
-        elif isinstance(self.model, (OVModelForVisualCausalLM, _OVModelForWhisper)):
+        elif isinstance(
+            self.model, (OVModelForVisualCausalLM, _OVModelForWhisper, OVModelForZeroShotImageClassification)
+        ):
             if config.processor is None:
                 raise ValueError(
                     "`processor` must be specified in order to run data-aware quantization. Please provide it as a"
@@ -307,6 +312,16 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
                     trust_remote_code=config.trust_remote_code,
                     streaming=dataset_metadata["streaming"],
                 )
+            elif isinstance(self.model, OVModelForZeroShotImageClassification):
+                dataset_metadata = PREDEFINED_TEXT_IMAGE_ENCODER_DATASETS[config.dataset]
+                return self.build_from_dataset_name(
+                    config,
+                    dataset_metadata["id"],
+                    num_samples=None,
+                    dataset_split=dataset_metadata["split"],
+                    trust_remote_code=config.trust_remote_code,
+                    streaming=dataset_metadata["streaming"],
+                )
             else:
                 raise Exception
         elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
@@ -330,13 +345,14 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
             return self.build_from_dataset(config, dataset)
         elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
             if isinstance(config.dataset, str):
+                dataset_metadata = PREDEFINED_LANGUAGE_DATASETS[config.dataset]
                 dataset = self.load_dataset(
-                    PREDEFINED_LANGUAGE_DATASETS[config.dataset]["path"],
+                    dataset_metadata["id"],
                     num_samples=None,
-                    dataset_config_name=PREDEFINED_LANGUAGE_DATASETS[config.dataset]["name"],
-                    dataset_split=PREDEFINED_LANGUAGE_DATASETS[config.dataset]["split"],
+                    dataset_config_name=dataset_metadata["name"],
+                    dataset_split=dataset_metadata["split"],
                     trust_remote_code=config.trust_remote_code,
-                    streaming=PREDEFINED_LANGUAGE_DATASETS[config.dataset]["streaming"],
+                    streaming=dataset_metadata["streaming"],
                 )
             elif isinstance(config.dataset, list) and all(isinstance(it, str) for it in config.dataset):
                 dataset = datasets.Dataset.from_list([{"text": it} for it in config.dataset])
@@ -345,6 +361,8 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
                     "Please provide dataset as one of the accepted dataset labels or as a list of strings."
                 )
             return self.build_from_dataset(config, dataset)
+        else:
+            raise RuntimeError("Unsupported model type for calibration dataset collection.")

     def build_from_dataset_name(
         self,
@@ -449,6 +467,7 @@ def build_from_dataset(
             OVModelForFeatureExtraction,
             OVModelForMaskedLM,
             OVModelForVisualCausalLM,
+            OVModelForZeroShotImageClassification,
             OVSentenceTransformer,
         )
         from optimum.intel.openvino.modeling_decoder import OVBaseDecoderModel
@@ -470,6 +489,7 @@ def build_from_dataset(
                 _OVModelForWhisper,
                 OVModelForFeatureExtraction,
                 OVModelForMaskedLM,
+                OVModelForZeroShotImageClassification,
                 OVSentenceTransformer,
             ),
         ) or (is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)):
@@ -487,6 +507,8 @@ def build_from_dataset(
                 return self._prepare_diffusion_calibration_data(quantization_config, dataset)
             elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
                 return self._prepare_text_encoder_model_calibration_data(quantization_config, dataset)
+            elif isinstance(self.model, OVModelForZeroShotImageClassification):
+                return self._prepare_text_image_encoder_model_calibration_data(quantization_config, dataset)
             else:
                 raise RuntimeError("Unsupported model type for calibration dataset collection.")
         else:
@@ -878,6 +900,74 @@ def get_tokenizer():

         return OVCalibrationDataset({"model": nncf.Dataset(calibration_data)})

+    def _prepare_text_image_encoder_model_calibration_data(
+        self,
+        quantization_config: OVQuantizationConfigBase,
+        dataset: "Dataset",
+        seq_len: int = 128,
+    ) -> OVCalibrationDataset:
+        self.model.compile()
+
+        def get_processor():
+            processor = AutoProcessor.from_pretrained(
+                quantization_config.processor, trust_remote_code=quantization_config.trust_remote_code
+            )
+            return processor
+
+        max_position_embeddings = getattr(self.model.config, "max_position_embeddings", None)
+        if max_position_embeddings is not None and max_position_embeddings > 0:
+            seq_len = min(seq_len, max_position_embeddings)
+
+        num_samples = quantization_config.num_samples or 128
+        calibration_data = []
+        try:
+            inference_result_mock = {
+                "logits_per_image": np.empty((1,), np.float32),
+                "logits_per_text": np.empty((1,), np.float32),
+                "text_embeds": np.empty((1,), np.float32),
+                "image_embeds": np.empty((1,), np.float32),
+            }
+
+            self.model.request = InferRequestWrapper(
+                self.model.request,
+                calibration_data,
+                inference_result_mock=inference_result_mock,
+            )
+
+            processor = None
+            pbar = tqdm(total=num_samples, desc="Collecting calibration data")
+            for item in dataset:
+                if "input_ids" in item:
+                    # Assuming that dataset contains already preprocessed text
+                    inputs = self._wrap_sample_as_array(item, add_batch_dim=True)
+                else:
+                    dataset_metadata = PREDEFINED_TEXT_IMAGE_ENCODER_DATASETS[quantization_config.dataset]
+                    try:
+                        response = requests.get(item[dataset_metadata["image_column_name"]], timeout=5)
+                        response.raise_for_status()
+                        image = Image.open(BytesIO(response.content))
+                    except Exception:
+                        continue
+                    processor = processor or get_processor()
+                    inputs = processor(
+                        text=item[dataset_metadata["text_column_name"]],
+                        images=image.convert("RGB"),
+                        return_tensors="pt",
+                        padding=True,
+                    )
+                    if inputs["input_ids"].shape[1] > seq_len:
+                        inputs["input_ids"] = inputs["input_ids"][:, :seq_len]
+
+                self.model(**inputs)
+
+                pbar.update(min(num_samples, len(calibration_data)) - pbar.n)
+                if len(calibration_data) >= num_samples:
+                    break
+        finally:
+            self.model.request = self.model.request.request
+
+        return OVCalibrationDataset({"model": nncf.Dataset(calibration_data)})
+
     @staticmethod
     def _wrap_sample_as_array(
         sample: Dict[str, Any], add_batch_dim: bool = False
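The same calibration path can also be driven through `OVQuantizer` directly, the way `_from_pretrained` does in `modeling_base.py` above. A hedged sketch (the `num_samples` value is illustrative; per the hunk above, `processor` must be set for this model type or a `ValueError` is raised):

```python
from optimum.intel import (
    OVConfig,
    OVModelForZeroShotImageClassification,
    OVQuantizationConfig,
    OVQuantizer,
)

model = OVModelForZeroShotImageClassification.from_pretrained("openai/clip-vit-base-patch16")
quantizer = OVQuantizer(model)
config = OVQuantizationConfig(
    bits=8,
    dataset="conceptual_captions",  # resolved via PREDEFINED_TEXT_IMAGE_ENCODER_DATASETS
    processor="openai/clip-vit-base-patch16",  # required for data-aware quantization
    num_samples=32,  # illustrative; the collection loop above defaults to 128
)
quantizer.quantize(ov_config=OVConfig(quantization_config=config))
```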

optimum/intel/openvino/utils.py

Lines changed: 12 additions & 2 deletions

@@ -152,8 +152,8 @@
 PREDEFINED_CAUSAL_LANGUAGE_DATASETS = {"wikitext2", "c4", "c4-new", "auto"}

 PREDEFINED_LANGUAGE_DATASETS = {
-    "wikitext2": {"path": "wikitext", "name": "wikitext-2-raw-v1", "split": "train", "streaming": False},
-    "c4": {"path": "allenai/c4", "name": "en", "split": "train", "streaming": True},
+    "wikitext2": {"id": "wikitext", "name": "wikitext-2-raw-v1", "split": "train", "streaming": False},
+    "c4": {"id": "allenai/c4", "name": "en", "split": "train", "streaming": True},
 }

 PREDEFINED_SD_DATASETS = {
@@ -166,6 +166,16 @@
     "laion/filtered-wit": {"split": "train", "prompt_column_name": "caption", "streaming": True},
 }

+PREDEFINED_TEXT_IMAGE_ENCODER_DATASETS = {
+    "conceptual_captions": {
+        "id": "conceptual_captions",
+        "split": "train",
+        "text_column_name": "caption",
+        "image_column_name": "image_url",
+        "streaming": True,
+    },
+}
+
 PREDEFINED_VISUAL_LM_DATASETS = {
     "contextual": {
         "id": "ucla-contextual/contextual_test",

tests/openvino/test_exporters_cli.py

Lines changed: 12 additions & 0 deletions

@@ -383,6 +383,18 @@ class OVCLIExportTestCase(unittest.TestCase):
                 {"int8": 16},
             ],
         ),
+        (
+            "zero-shot-image-classification",
+            "clip",
+            "int8",
+            "--dataset conceptual_captions --num-samples 1",
+            [
+                65,
+            ],
+            [
+                {"int8": 65},
+            ],
+        ),
     ]

     TEST_4BIT_CONFIGURATIONS = [
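For completeness, a quantized export produced by this flow loads back like any other OpenVINO model. A minimal inference sketch (assuming the processor was saved alongside the model, otherwise load it from the original checkpoint; the `logits_per_image` output name comes from the mock dict in `quantization.py` above):

```python
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForZeroShotImageClassification

model = OVModelForZeroShotImageClassification.from_pretrained("./save_dir")
processor = AutoProcessor.from_pretrained("./save_dir")

image = Image.new("RGB", (224, 224))  # placeholder; use a real image in practice
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)
probs = model(**inputs).logits_per_image.softmax(dim=-1)
```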
