
Commit 264205b

Add openvino Zero-shot-Image-Classification support (#1273)
* support Zero-shot-Image-Classification
* add tests
* update supported models
* skip siglip for old transformers
* apply review comments
1 parent c909268 commit 264205b
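
The commit adds an OVModelForZeroShotImageClassification class that mirrors the transformers AutoModelForZeroShotImageClassification API on top of OpenVINO. A minimal usage sketch of the new class (the checkpoint id and label prompts are illustrative, not part of the commit; the image URL is the one used in the tests):

import requests
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForZeroShotImageClassification

model_id = "openai/clip-vit-base-patch32"  # illustrative checkpoint, not taken from the commit
model = OVModelForZeroShotImageClassification.from_pretrained(model_id, export=True)  # convert to OpenVINO IR on the fly
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open(
    requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
).convert("RGB")
labels = ["a photo of a cat", "a photo of a dog"]
inputs = processor(images=image, text=labels, return_tensors="pt", padding=True)

outputs = model(**inputs)  # CLIPOutput with torch tensors, since the inputs are torch tensors
probs = outputs.logits_per_image.softmax(dim=-1)  # (num_images, num_labels) -> per-label probabilities
print(dict(zip(labels, probs[0].tolist())))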

10 files changed: 122 additions and 1 deletion

docs/source/openvino/models.mdx

Lines changed: 1 addition & 0 deletions
@@ -119,6 +119,7 @@ Here is the list of the supported architectures :
 - SEW
 - SEW-D
 - Segformer
+- SigLIP
 - SmolVLM(SmolVLM2)
 - SpeechT5 (text-to-speech)
 - SqueezeBert

optimum/intel/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -191,6 +191,7 @@
             "OVModelOpenCLIPVisual",
             "OVModelOpenCLIPText",
             "OVModelOpenCLIPForZeroShotImageClassification",
+            "OVModelForZeroShotImageClassification",
             "OVSamModel",
         ]
     )
@@ -356,6 +357,7 @@
         OVModelForTokenClassification,
         OVModelForVision2Seq,
         OVModelForVisualCausalLM,
+        OVModelForZeroShotImageClassification,
         OVModelOpenCLIPForZeroShotImageClassification,
         OVModelOpenCLIPText,
         OVModelOpenCLIPVisual,

optimum/intel/openvino/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@
     OVModelForQuestionAnswering,
     OVModelForSequenceClassification,
     OVModelForTokenClassification,
+    OVModelForZeroShotImageClassification,
 )
 from .modeling_decoder import OVModelForCausalLM
 from .modeling_open_clip import (

optimum/intel/openvino/modeling.py

Lines changed: 44 additions & 0 deletions
@@ -35,6 +35,7 @@
     AutoModelForQuestionAnswering,
     AutoModelForSequenceClassification,
     AutoModelForTokenClassification,
+    AutoModelForZeroShotImageClassification,
     PretrainedConfig,
 )
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
@@ -49,6 +50,7 @@
     TokenClassifierOutput,
     XVectorOutput,
 )
+from transformers.models.clip.modeling_clip import CLIPOutput

 from ..utils.import_utils import is_timm_available, is_timm_version
 from .modeling_base import OVBaseModel
@@ -952,3 +954,45 @@ def forward(self, **kwargs):
             model_outputs[key_name] = torch.from_numpy(value).to(self.device) if not np_inputs else value

         return ModelOutput(**model_outputs)
+
+
+class OVModelForZeroShotImageClassification(OVModel):
+    auto_model_class = AutoModelForZeroShotImageClassification
+    export_feature = "zero-shot-image-classification"
+
+    def forward(self, input_ids, pixel_values, attention_mask: Optional[torch.Tensor] = None, **kwargs):
+        self.compile()
+
+        np_inputs = isinstance(input_ids, np.ndarray)
+        if not np_inputs:
+            input_ids = input_ids.cpu().numpy()
+            pixel_values = pixel_values.cpu().numpy()
+            attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
+        inputs = {"input_ids": input_ids, "pixel_values": pixel_values}
+        # Add the attention_mask when needed
+        if "attention_mask" in self.input_names:
+            inputs["attention_mask"] = attention_mask if attention_mask is not None else np.ones_like(input_ids)
+        outputs = self._inference(inputs)
+        logits_per_image = (
+            torch.from_numpy(outputs["logits_per_image"]).to(self.device)
+            if not np_inputs
+            else outputs["logits_per_image"]
+        )
+        logits_per_text = (
+            torch.from_numpy(outputs["logits_per_text"]).to(self.device)
+            if not np_inputs
+            else outputs["logits_per_text"]
+        )
+        text_embeds = (
+            torch.from_numpy(outputs["text_embeds"]).to(self.device) if not np_inputs else outputs["text_embeds"]
+        )
+        image_embeds = (
+            torch.from_numpy(outputs["image_embeds"]).to(self.device) if not np_inputs else outputs["image_embeds"]
+        )
+
+        return CLIPOutput(
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+        )
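
The forward above follows the same convention as the other OVModel heads: torch tensor inputs yield torch tensor outputs on the model device, while numpy inputs are fed straight to the compiled model and the OpenVINO results are returned as numpy arrays, in both cases wrapped in transformers' CLIPOutput. A minimal sketch of the numpy path, continuing the usage example from the commit description above (the assertion reflects the np_inputs branch in the new forward):

import numpy as np

np_inputs = {
    "input_ids": np.array(inputs["input_ids"]),        # token ids of the label prompts
    "pixel_values": np.array(inputs["pixel_values"]),  # preprocessed image batch
    "attention_mask": np.array(inputs["attention_mask"]),
}
np_outputs = model(**np_inputs)
assert isinstance(np_outputs.logits_per_image, np.ndarray)  # no torch conversion on the numpy path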

optimum/intel/openvino/utils.py

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@
     "question-answering": "OVModelForQuestionAnswering",
     "image-classification": "OVModelForImageClassification",
     "image-text-to-text": "OVModelForVisualCausalLM",
+    "zero-shot-image-classification": "OVModelForZeroShotImageClassification",
     "audio-classification": "OVModelForAudioClassification",
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",

optimum/intel/utils/dummy_openvino_objects.py

Lines changed: 11 additions & 0 deletions
@@ -59,6 +59,17 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["openvino"])


+class OVModelForZeroShotImageClassification(metaclass=DummyObject):
+    _backends = ["openvino"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino"])
+
+
 class OVModelForAudioFrameClassification(metaclass=DummyObject):
     _backends = ["openvino"]

tests/openvino/test_export.py

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@
     OVModelForTextToSpeechSeq2Seq,
     OVModelForTokenClassification,
     OVModelForVisualCausalLM,
+    OVModelForZeroShotImageClassification,
     OVSamModel,
     OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
@@ -78,6 +79,7 @@ class ExportModelTest(unittest.TestCase):
         "llava": OVModelForVisualCausalLM,
         "sam": OVSamModel,
         "speecht5": OVModelForTextToSpeechSeq2Seq,
+        "clip": OVModelForZeroShotImageClassification,
     }

     EXPECTED_DIFFUSERS_SCALE_FACTORS = {

tests/openvino/test_exporters_cli.py

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,7 @@
     OVModelForTextToSpeechSeq2Seq,
     OVModelForTokenClassification,
     OVModelForVisualCausalLM,
+    OVModelForZeroShotImageClassification,
     OVModelOpenCLIPForZeroShotImageClassification,
     OVModelOpenCLIPText,
     OVModelOpenCLIPVisual,
@@ -87,6 +88,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("image-to-image", "stable-diffusion-xl-refiner"),
         ("feature-extraction", "sam"),
         ("text-to-audio", "speecht5"),
+        ("zero-shot-image-classification", "clip"),
     ]

     if is_transformers_version(">=", "4.45"):
@@ -119,6 +121,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "ltx-video": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,
         "sam": 0,  # no tokenizer
         "speecht5": 2,
+        "clip": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0,
     }

     TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS = {

tests/openvino/test_modeling.py

Lines changed: 54 additions & 1 deletion
@@ -53,6 +53,7 @@
     AutoModelForSpeechSeq2Seq,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    AutoModelForZeroShotImageClassification,
     AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
@@ -95,6 +96,7 @@
     OVModelForTokenClassification,
     OVModelForVision2Seq,
     OVModelForVisualCausalLM,
+    OVModelForZeroShotImageClassification,
     OVModelOpenCLIPForZeroShotImageClassification,
     OVSamModel,
     OVSentenceTransformer,
@@ -2817,7 +2819,7 @@ def test_pipeline(self, model_arch: str):
         ov_model.reshape(1, -1)
         ov_model.compile()

-        # Speech recogition generation
+        # Image caption generation
         pipe = pipeline(
             "image-to-text",
             model=ov_model,
@@ -3295,5 +3297,56 @@ def test_compare_to_transformers(self, model_arch):
         del vocoder
         del model
         del processor
+        gc.collect()
+

+class OVModelForZeroShotImageClassificationIntegrationTest(unittest.TestCase):
+    SUPPORTED_ARCHITECTURES = ["clip"]
+    if is_transformers_version(">=", "4.45"):
+        SUPPORTED_ARCHITECTURES.append("siglip")
+    TASK = "zero-shot-image-classification"
+    IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_compare_to_transformers(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
+        ov_model = OVModelForZeroShotImageClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        processor = get_preprocessor(model_id)
+
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+
+        IMAGE = Image.open(
+            requests.get(
+                self.IMAGE_URL,
+                stream=True,
+            ).raw
+        ).convert("RGB")
+        labels = ["a photo of a cat", "a photo of a dog"]
+        inputs = processor(images=IMAGE, text=labels, return_tensors="pt")
+
+        transformers_model = AutoModelForZeroShotImageClassification.from_pretrained(model_id)
+
+        # test end-to-end inference
+        ov_outputs = ov_model(**inputs)
+
+        self.assertTrue("logits_per_image" in ov_outputs)
+        self.assertIsInstance(ov_outputs.logits_per_image, torch.Tensor)
+        self.assertTrue("logits_per_text" in ov_outputs)
+        self.assertIsInstance(ov_outputs.logits_per_text, torch.Tensor)
+        self.assertTrue("text_embeds" in ov_outputs)
+        self.assertIsInstance(ov_outputs.text_embeds, torch.Tensor)
+        self.assertTrue("image_embeds" in ov_outputs)
+        self.assertIsInstance(ov_outputs.image_embeds, torch.Tensor)
+
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs)
+        # Compare tensor outputs
+        self.assertTrue(torch.allclose(ov_outputs.logits_per_image, transformers_outputs.logits_per_image, atol=1e-4))
+        self.assertTrue(torch.allclose(ov_outputs.logits_per_text, transformers_outputs.logits_per_text, atol=1e-4))
+        self.assertTrue(torch.allclose(ov_outputs.text_embeds, transformers_outputs.text_embeds, atol=1e-4))
+        self.assertTrue(torch.allclose(ov_outputs.image_embeds, transformers_outputs.image_embeds, atol=1e-4))
+
+        del transformers_model
+        del ov_model
         gc.collect()

tests/openvino/utils_tests.py

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,7 @@
     "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel",
     "bloom": "hf-internal-testing/tiny-random-BloomModel",
     "camembert": "hf-internal-testing/tiny-random-camembert",
+    "clip": "hf-tiny-model-private/tiny-random-CLIPModel",
     "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification",
     "cohere": "hf-internal-testing/tiny-random-CohereForCausalLM",
     "chatglm": "katuni4ka/tiny-random-chatglm2",
@@ -154,6 +155,7 @@
     "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random",
     "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM",
     "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM",
+    "siglip": "katuni4ka/tiny-random-SiglipModel",
     "latent-consistency": "echarlaix/tiny-random-latent-consistency",
     "sew": "hf-internal-testing/tiny-random-SEWModel",
     "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h",
@@ -223,6 +225,7 @@
     "ltx-video": (34, 28, 28, 64),
     "sam": (102, 100),
     "speecht5": (28, 52, 10, 80),
+    "clip": (130,),
 }

 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
