
Commit f2f1195

Add zero shot image classification (#52)
* Add zero shot
* zero
* simplifies patch
* fix _make_causal
1 parent dc9d586 commit f2f1195

8 files changed (+309 lines, -77 lines)


CHANGELOGS.rst

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ Change Logs
 0.4.0
 +++++
 
+* :pr:`52`: add support for zero-shot-image-classification
 * :pr:`50`: add support for onnxruntime fusion
 * :pr:`48`: add support for EncoderDecoderCache, test with openai/whisper-tiny
 * :pr:`45`: improve change_dynamic_dimension to fix some dimensions
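For context, zero-shot image classification pairs an image with a set of candidate captions and scores each pair. A minimal way to try the task outside this repository is the transformers pipeline API; the snippet below is an illustration only, not code from this commit:

    from transformers import pipeline

    classifier = pipeline(
        "zero-shot-image-classification",
        model="openai/clip-vit-base-patch16",
    )
    predictions = classifier(
        "http://images.cocodataset.org/val2017/000000039769.jpg",
        candidate_labels=["a photo of a cat", "a photo of a dog"],
    )
    # predictions is a list of {"label": ..., "score": ...} entries sorted by score
    print(predictions)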

_unittests/ut_torch_models/test_hghub_model.py

Lines changed: 11 additions & 0 deletions
@@ -96,6 +96,17 @@ def test_get_untrained_model_with_inputs_codellama(self):
         # different expected value for different version of transformers
         self.assertIn((data["size"], data["n_weights"]), [(410532864, 102633216)])
 
+    @hide_stdout()
+    @ignore_errors(OSError)
+    def test_get_untrained_model_with_inputs_clip_vit(self):
+        mid = "openai/clip-vit-base-patch16"
+        data = get_untrained_model_with_inputs(mid, verbose=1)
+        model, inputs = data["model"], data["inputs"]
+        with bypass_export_some_errors(patch_transformers=True):
+            model(**inputs)
+        # different expected value for different version of transformers
+        self.assertIn((data["size"], data["n_weights"]), [(188872708, 47218177)])
+
     @hide_stdout()
     def test_get_untrained_model_with_inputs_text2text_generation(self):
         mid = "sshleifer/tiny-marian-en-de"

_unittests/ut_torch_models/try_tasks.py

Lines changed: 47 additions & 0 deletions
@@ -25,6 +25,53 @@ def test_image_classification(self):
         outputs = model(**inputs)
         print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))
 
+    @never_test()
+    def test_image_classification_resnet(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k resnet
+
+        from transformers import ViTImageProcessor, ViTModel
+        from PIL import Image
+        import requests
+
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        processor = ViTImageProcessor.from_pretrained("microsoft/resnet-50")
+        model = ViTModel.from_pretrained("microsoft/resnet-50")
+        inputs = processor(images=image, return_tensors="pt")
+        print()
+        print("-- inputs", string_type(inputs, with_shape=True, with_min_max=True))
+
+        outputs = model(**inputs)
+        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))
+
+    @never_test()
+    def test_zero_shot_image_classification(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k zero
+        from PIL import Image
+        import requests
+        from transformers import CLIPProcessor, CLIPModel
+
+        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
+        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        inputs = processor(
+            text=["a photo of a cat", "a photo of a dog"],
+            images=[image, image],
+            return_tensors="pt",
+            padding=True,
+        )
+        print()
+        print("-- inputs", string_type(inputs, with_shape=True, with_min_max=True))
+        outputs = model(**inputs)
+        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))
+        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        probs = logits_per_image.softmax(
+            dim=1
+        )  # we can take the softmax to get the label probabilities
+        assert probs is not None
+
     @never_test()
     def test_text2text_generation(self):
         # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k text2t

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 37 additions & 47 deletions
@@ -1,5 +1,4 @@
 import inspect
-import sys
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple
 import torch
@@ -44,56 +43,47 @@ def _patch_make_causal_mask(
     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
 
 
-if sys.version_info[:2] <= (3, 11):
-
-    @dataclass
-    class patched_AttentionMaskConverter:
-        """
-        Patches
-        ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
-        """
-
-        _PATCHES_ = ["_make_causal_mask"]
-        _PATCHED_CLASS_ = AttentionMaskConverter
-
-        @staticmethod
-        def _make_causal_mask(
-            input_ids_shape: torch.Size,
-            dtype: torch.dtype,
-            device: torch.device,
-            past_key_values_length: int = 0,
-            sliding_window: Optional[int] = None,
-        ):
-            """Patched method."""
-            return _patch_make_causal_mask(
-                input_ids_shape, dtype, device, past_key_values_length, sliding_window
-            )
+@dataclass
+class patched_AttentionMaskConverter:
+    """
+    Patches
+    ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
+    """
 
-else:
+    _PATCHES_ = ["_make_causal_mask"]
+    _PATCHED_CLASS_ = AttentionMaskConverter
 
-    @dataclass
-    class patched_AttentionMaskConverter:
-        """
-        Patches
-        ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
+    @staticmethod
+    def _make_causal_mask(
+        *args,
+        **kwargs,
+        # input_ids_shape: torch.Size,
+        # dtype: torch.dtype,
+        # device: torch.device,
+        # past_key_values_length: int = 0,
+        # sliding_window: Optional[int] = None,
+    ):
         """
+        Patched method.
 
-        _PATCHES_ = ["_make_causal_mask"]
-        _PATCHED_CLASS_ = AttentionMaskConverter
-
-        @staticmethod
-        def _make_causal_mask(
-            self,
-            input_ids_shape: torch.Size,
-            dtype: torch.dtype,
-            device: torch.device,
-            past_key_values_length: int = 0,
-            sliding_window: Optional[int] = None,
-        ):
-            """Patched method."""
-            return _patch_make_causal_mask(
-                input_ids_shape, dtype, device, past_key_values_length, sliding_window
-            )
+        This static method may be called with ``AttentionMaskConverter._make_causal_mask``
+        or ``self._make_causal_mask``. That changes the arguments it receives.
+        That should not matter but...
+        """
+        if args:
+            index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
+            names = [
+                "input_ids_shape",
+                "dtype",
+                "device",
+                "past_key_values_length",
+                "sliding_window",
+            ]
+            for i, a in enumerate(args):
+                if i < index:
+                    continue
+                kwargs[names[i - index]] = a
+        return _patch_make_causal_mask(**kwargs)
 
 
 class patched_DynamicCache:
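The reason for the *args rewrite above is that, once the patch is applied, _make_causal_mask may be reached either as a plain static call or through an instance, in which case args[0] is the converter itself rather than the shape. The standalone toy below (not code from the commit) mirrors the same normalization logic:

    import torch

    def _normalize(args, kwargs):
        # mirrors the patched method: skip a leading `self` when the first
        # positional argument is not the input_ids shape
        names = ["input_ids_shape", "dtype", "device",
                 "past_key_values_length", "sliding_window"]
        index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
        for i, a in enumerate(args):
            if i >= index:
                kwargs[names[i - index]] = a
        return kwargs

    class Dummy:  # stands in for an AttentionMaskConverter instance
        pass

    # static-style call: the shape is the first positional argument
    print(_normalize((torch.Size([1, 4]), torch.float32), {}))
    # instance-style call: a `self` slips in front of the shape and is skipped
    print(_normalize((Dummy(), torch.Size([1, 4]), torch.float32), {}))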

onnx_diagnostic/torch_models/hghub/hub_api.py

Lines changed: 14 additions & 1 deletion
@@ -4,7 +4,20 @@
 import transformers
 from huggingface_hub import HfApi, model_info
 from . import hub_data_cached_configs
-from .hub_data import __date__, __data_tasks__, load_architecture_task
+from .hub_data import __date__, __data_tasks__, load_architecture_task, __data_arch_values__
+
+
+@functools.cache
+def get_architecture_default_values(architecture: str):
+    """
+    The configuration may miss information needed to build the dummy inputs.
+    This function returns the missing pieces.
+    """
+    assert architecture in __data_arch_values__, (
+        f"No known default values for {architecture!r}, "
+        f"expecting one architecture in {', '.join(sorted(__data_arch_values__))}"
+    )
+    return __data_arch_values__[architecture]
 
 
 @functools.cache
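A sketch of how the new helper is meant to be used. The import path is inferred from the file location, and the unknown-architecture call is shown only to illustrate the assertion:

    from onnx_diagnostic.torch_models.hghub.hub_api import get_architecture_default_values

    defaults = get_architecture_default_values("ResNetForImageClassification")
    print(defaults)  # {'image_size': 224}, taken from __data_arch_values__ in hub_data.py

    # Any architecture missing from __data_arch_values__ fails loudly:
    # get_architecture_default_values("SomeUnknownArchitecture")  # -> AssertionError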

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 15 additions & 13 deletions
@@ -5,6 +5,8 @@
 
 __date__ = "2025-03-26"
 
+__data_arch_values__ = {"ResNetForImageClassification": dict(image_size=224)}
+
 __data_arch__ = textwrap.dedent(
     """
     architecture,task
@@ -127,25 +129,25 @@
 )
 
 __data_tasks__ = [
+    "audio-classification",
     "automatic-speech-recognition",
-    "image-text-to-text",
-    "image-to-text",
-    "text-generation",
-    "object-detection",
     "document-question-answering",
     "feature-extraction",
-    "text-to-audio",
-    "zero-shot-image-classification",
+    "fill-mask",
+    "image-classification",
+    "image-feature-extraction",
     "image-segmentation",
-    "reinforcement-learning",
+    "image-text-to-text",
+    "image-to-text",
+    "keypoint-detection",
+    "mask-generation",
     "no-pipeline-tag",
-    "image-classification",
+    "object-detection",
+    "reinforcement-learning",
+    "text-generation",
+    "text-to-audio",
     "text2text-generation",
-    "mask-generation",
-    "keypoint-detection",
-    "audio-classification",
-    "image-feature-extraction",
-    "fill-mask",
+    "zero-shot-image-classification",
 ]
 
 __models_testing__ = """

onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py

Lines changed: 50 additions & 0 deletions
@@ -3389,3 +3389,53 @@ def _ccached_openai_whisper_tiny():
             "vocab_size": 51865,
         }
     )
+
+
+def _ccached_openai_clip_vit_base_patch16():
+    "openai/clip-vit-base-patch16"
+    return transformers.CLIPConfig(
+        **{
+            "architectures": ["CLIPModel"],
+            "initializer_factor": 1.0,
+            "logit_scale_init_value": 2.6592,
+            "model_type": "clip",
+            "projection_dim": 512,
+            "text_config": {
+                "attention_dropout": 0.0,
+                "bos_token_id": 0,
+                "dropout": 0.0,
+                "eos_token_id": 2,
+                "hidden_act": "quick_gelu",
+                "hidden_size": 512,
+                "initializer_factor": 1.0,
+                "initializer_range": 0.02,
+                "intermediate_size": 2048,
+                "layer_norm_eps": 1e-05,
+                "max_position_embeddings": 77,
+                "model_type": "clip_text_model",
+                "num_attention_heads": 8,
+                "num_hidden_layers": 12,
+                "projection_dim": 512,
+                "vocab_size": 49408,
+            },
+            "torch_dtype": "float32",
+            "transformers_version": "4.52.0.dev0",
+            "vision_config": {
+                "attention_dropout": 0.0,
+                "dropout": 0.0,
+                "hidden_act": "quick_gelu",
+                "hidden_size": 768,
+                "image_size": 224,
+                "initializer_factor": 1.0,
+                "initializer_range": 0.02,
+                "intermediate_size": 3072,
+                "layer_norm_eps": 1e-05,
+                "model_type": "clip_vision_model",
+                "num_attention_heads": 12,
+                "num_channels": 3,
+                "num_hidden_layers": 12,
+                "patch_size": 16,
+                "projection_dim": 512,
+            },
+        }
+    )
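What the cached configuration buys: an untrained CLIPModel can be instantiated offline, without downloading the checkpoint. The direct call to the private helper below is for illustration only; the library normally resolves cached configs through its hub helpers, and the import path is assumed from the file location:

    import transformers
    from onnx_diagnostic.torch_models.hghub.hub_data_cached_configs import (
        _ccached_openai_clip_vit_base_patch16,
    )

    config = _ccached_openai_clip_vit_base_patch16()
    model = transformers.CLIPModel(config)  # randomly initialized, no network access
    print(type(model).__name__, sum(p.numel() for p in model.parameters()))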
