Merged
Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
0.4.0
+++++

* :pr:`52`: add support for zero-shot-image-classification
* :pr:`50`: add support for onnxruntime fusion
* :pr:`48`: add support for EncoderDecoderCache, test with openai/whisper-tiny
* :pr:`45`: improve change_dynamic_dimension to fix some dimensions
11 changes: 11 additions & 0 deletions _unittests/ut_torch_models/test_hghub_model.py
@@ -96,6 +96,17 @@ def test_get_untrained_model_with_inputs_codellama(self):
        # different expected values for different versions of transformers
        self.assertIn((data["size"], data["n_weights"]), [(410532864, 102633216)])

    @hide_stdout()
    @ignore_errors(OSError)
    def test_get_untrained_model_with_inputs_clip_vit(self):
        mid = "openai/clip-vit-base-patch16"
        data = get_untrained_model_with_inputs(mid, verbose=1)
        model, inputs = data["model"], data["inputs"]
        with bypass_export_some_errors(patch_transformers=True):
            model(**inputs)
        # different expected values for different versions of transformers
        self.assertIn((data["size"], data["n_weights"]), [(188872708, 47218177)])

    @hide_stdout()
    def test_get_untrained_model_with_inputs_text2text_generation(self):
        mid = "sshleifer/tiny-marian-en-de"
47 changes: 47 additions & 0 deletions _unittests/ut_torch_models/try_tasks.py
@@ -25,6 +25,53 @@ def test_image_classification(self):
        outputs = model(**inputs)
        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))

    @never_test()
    def test_image_classification_resnet(self):
        # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k resnet

        from transformers import ViTImageProcessor, ViTModel
        from PIL import Image
        import requests

        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)

        processor = ViTImageProcessor.from_pretrained("microsoft/resnet-50")
        model = ViTModel.from_pretrained("microsoft/resnet-50")
        inputs = processor(images=image, return_tensors="pt")
        print()
        print("-- inputs", string_type(inputs, with_shape=True, with_min_max=True))

        outputs = model(**inputs)
        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))

    @never_test()
    def test_zero_shot_image_classification(self):
        # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k zero
        from PIL import Image
        import requests
        from transformers import CLIPProcessor, CLIPModel

        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        inputs = processor(
            text=["a photo of a cat", "a photo of a dog"],
            images=[image, image],
            return_tensors="pt",
            padding=True,
        )
        print()
        print("-- inputs", string_type(inputs, with_shape=True, with_min_max=True))
        outputs = model(**inputs)
        print("-- outputs", string_type(outputs, with_shape=True, with_min_max=True))
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        probs = logits_per_image.softmax(
            dim=1
        )  # we can take the softmax to get the label probabilities
        assert probs is not None

    @never_test()
    def test_text2text_generation(self):
        # clear&&NEVERTEST=1 python _unittests/ut_torch_models/try_tasks.py -k text2t
84 changes: 37 additions & 47 deletions onnx_diagnostic/torch_export_patches/patches/patch_transformers.py
@@ -1,5 +1,4 @@
import inspect
import sys
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import torch
@@ -44,56 +43,47 @@ def _patch_make_causal_mask(
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


-if sys.version_info[:2] <= (3, 11):
-
-    @dataclass
-    class patched_AttentionMaskConverter:
-        """
-        Patches
-        ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
-        """
-
-        _PATCHES_ = ["_make_causal_mask"]
-        _PATCHED_CLASS_ = AttentionMaskConverter
-
-        @staticmethod
-        def _make_causal_mask(
-            input_ids_shape: torch.Size,
-            dtype: torch.dtype,
-            device: torch.device,
-            past_key_values_length: int = 0,
-            sliding_window: Optional[int] = None,
-        ):
-            """Patched method."""
-            return _patch_make_causal_mask(
-                input_ids_shape, dtype, device, past_key_values_length, sliding_window
-            )
-
-else:
-
-    @dataclass
-    class patched_AttentionMaskConverter:
-        """
-        Patches
-        ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
-        """
-
-        _PATCHES_ = ["_make_causal_mask"]
-        _PATCHED_CLASS_ = AttentionMaskConverter
-
-        @staticmethod
-        def _make_causal_mask(
-            self,
-            input_ids_shape: torch.Size,
-            dtype: torch.dtype,
-            device: torch.device,
-            past_key_values_length: int = 0,
-            sliding_window: Optional[int] = None,
-        ):
-            """Patched method."""
-            return _patch_make_causal_mask(
-                input_ids_shape, dtype, device, past_key_values_length, sliding_window
-            )
+@dataclass
+class patched_AttentionMaskConverter:
+    """
+    Patches
+    ``transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask``.
+    """
+
+    _PATCHES_ = ["_make_causal_mask"]
+    _PATCHED_CLASS_ = AttentionMaskConverter
+
+    @staticmethod
+    def _make_causal_mask(
+        *args,
+        **kwargs,
+        # input_ids_shape: torch.Size,
+        # dtype: torch.dtype,
+        # device: torch.device,
+        # past_key_values_length: int = 0,
+        # sliding_window: Optional[int] = None,
+    ):
+        """
+        Patched method.
+
+        This static method may be called as ``AttentionMaskConverter._make_causal_mask``
+        or as ``self._make_causal_mask``, which changes the arguments it receives.
+        That should not matter, but to be safe the positional arguments are remapped
+        to keyword arguments before calling the patch.
+        """
+        if args:
+            index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
+            names = [
+                "input_ids_shape",
+                "dtype",
+                "device",
+                "past_key_values_length",
+                "sliding_window",
+            ]
+            for i, a in enumerate(args):
+                if i < index:
+                    continue
+                kwargs[names[i - index]] = a
+        return _patch_make_causal_mask(**kwargs)


class patched_DynamicCache:
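As a side note, the positional-to-keyword remapping performed by the patched ``_make_causal_mask`` can be illustrated in isolation. The sketch below is not part of the PR; the helper name ``remap_to_kwargs`` is made up for the illustration and only mirrors the logic shown above.

# Standalone sketch of the argument remapping used by patched_AttentionMaskConverter.
# When the method is invoked as ``self._make_causal_mask(...)``, args[0] is the converter
# instance rather than the input shape, so the parameter names are shifted by one.
import torch

_NAMES = ["input_ids_shape", "dtype", "device", "past_key_values_length", "sliding_window"]


def remap_to_kwargs(*args, **kwargs):
    if args:
        index = 0 if isinstance(args[0], (tuple, torch.Size)) else 1
        for i, a in enumerate(args):
            if i >= index:
                kwargs[_NAMES[i - index]] = a
    return kwargs


# Both call styles end up with the same keyword dictionary.
shape = torch.Size([2, 8])
print(remap_to_kwargs(shape, torch.float32, torch.device("cpu")))
print(remap_to_kwargs("a converter instance", shape, torch.float32, torch.device("cpu")))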
15 changes: 14 additions & 1 deletion onnx_diagnostic/torch_models/hghub/hub_api.py
@@ -4,7 +4,20 @@
import transformers
from huggingface_hub import HfApi, model_info
from . import hub_data_cached_configs
from .hub_data import __date__, __data_tasks__, load_architecture_task
from .hub_data import __date__, __data_tasks__, load_architecture_task, __data_arch_values__


@functools.cache
def get_architecture_default_values(architecture: str):
    """
    The configuration may be missing information needed to build the dummy inputs.
    This function returns the missing pieces for a given architecture.
    """
    assert architecture in __data_arch_values__, (
        f"No known default values for {architecture!r}, "
        f"expecting one architecture in {', '.join(sorted(__data_arch_values__))}"
    )
    return __data_arch_values__[architecture]


@functools.cache
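For context, a small usage sketch of the new helper (this call site is not part of the diff); the expected return value follows from the ``__data_arch_values__`` entry added to ``hub_data.py`` below.

# Hedged example: the helper supplies defaults that the Hugging Face configuration
# alone does not provide when building dummy inputs for an architecture.
from onnx_diagnostic.torch_models.hghub.hub_api import get_architecture_default_values

print(get_architecture_default_values("ResNetForImageClassification"))
# expected: {'image_size': 224}, per __data_arch_values__ in hub_data.py

# Unknown architectures fail fast with an AssertionError listing the known ones.
try:
    get_architecture_default_values("NotAKnownArchitecture")
except AssertionError as exc:
    print(exc)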
28 changes: 15 additions & 13 deletions onnx_diagnostic/torch_models/hghub/hub_data.py
@@ -5,6 +5,8 @@

__date__ = "2025-03-26"

__data_arch_values__ = {"ResNetForImageClassification": dict(image_size=224)}

__data_arch__ = textwrap.dedent(
"""
architecture,task
@@ -127,25 +129,25 @@
)

__data_tasks__ = [
+    "audio-classification",
    "automatic-speech-recognition",
-    "image-text-to-text",
-    "image-to-text",
-    "text-generation",
-    "object-detection",
    "document-question-answering",
    "feature-extraction",
-    "text-to-audio",
-    "zero-shot-image-classification",
+    "fill-mask",
+    "image-classification",
+    "image-feature-extraction",
    "image-segmentation",
-    "reinforcement-learning",
+    "image-text-to-text",
+    "image-to-text",
+    "keypoint-detection",
+    "mask-generation",
    "no-pipeline-tag",
-    "image-classification",
+    "object-detection",
+    "reinforcement-learning",
+    "text-generation",
+    "text-to-audio",
    "text2text-generation",
-    "mask-generation",
-    "keypoint-detection",
-    "audio-classification",
-    "image-feature-extraction",
-    "fill-mask",
+    "zero-shot-image-classification",
]

__models_testing__ = """
50 changes: 50 additions & 0 deletions onnx_diagnostic/torch_models/hghub/hub_data_cached_configs.py
@@ -3389,3 +3389,53 @@ def _ccached_openai_whisper_tiny():
"vocab_size": 51865,
}
)


def _ccached_openai_clip_vit_base_patch16():
"openai/clip-vit-base-patch16"
return transformers.CLIPConfig(
**{
"architectures": ["CLIPModel"],
"initializer_factor": 1.0,
"logit_scale_init_value": 2.6592,
"model_type": "clip",
"projection_dim": 512,
"text_config": {
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 2,
"hidden_act": "quick_gelu",
"hidden_size": 512,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 2048,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 77,
"model_type": "clip_text_model",
"num_attention_heads": 8,
"num_hidden_layers": 12,
"projection_dim": 512,
"vocab_size": 49408,
},
"torch_dtype": "float32",
"transformers_version": "4.52.0.dev0",
"vision_config": {
"attention_dropout": 0.0,
"dropout": 0.0,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"image_size": 224,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"model_type": "clip_vision_model",
"num_attention_heads": 12,
"num_channels": 3,
"num_hidden_layers": 12,
"patch_size": 16,
"projection_dim": 512,
},
}
)
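For reference, a minimal sketch (not part of the diff) of how such a cached configuration can be turned into an untrained model without downloading weights, in the spirit of the ``get_untrained_model_with_inputs`` test above; importing the private helper directly is an assumption made for the illustration.

# Hedged sketch: build an untrained CLIP model from the cached configuration above.
import transformers
from onnx_diagnostic.torch_models.hghub.hub_data_cached_configs import (
    _ccached_openai_clip_vit_base_patch16,
)

config = _ccached_openai_clip_vit_base_patch16()  # a transformers.CLIPConfig
model = transformers.CLIPModel(config)  # randomly initialized, nothing is downloaded
print(type(model).__name__, sum(p.numel() for p in model.parameters()))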