@@ -42,6 +42,7 @@
     CLIPModelPatcher,
     CohereModelPatcher,
     FluxTransformerModelPatcher,
+    MetaCLIP2Patcher,
     MgpstrModelPatcher,
     MoonshineModelPatcher,
     MusicgenModelPatcher,
@@ -1247,6 +1248,85 @@ def outputs(self) -> dict[str, dict[int, str]]:
         return common_outputs


+@register_tasks_manager_onnx(
+    "metaclip_2",
+    *["feature-extraction", "zero-shot-image-classification", "image-classification"],
+    library_name="transformers",
+)
+class MetaCLIP2OnnxConfig(TextAndVisionOnnxConfig):
+    NORMALIZED_CONFIG_CLASS = CLIPNormalizedConfig
+    MIN_TRANSFORMERS_VERSION = version.parse("4.56.2")
+    VARIANTS = {  # noqa: RUF012
| 1260 | + "monolith": "All the MetaClip2 model components are exported as a single model.onnx.", |
| 1261 | + "split": "The vision model is exported as a separate vision_model.onnx, and the text_model is exported as text_model.onnx", |
+    }
+    DEFAULT_VARIANT = "monolith"
+    _MODEL_PATCHER = MetaCLIP2Patcher
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        variant: str = "monolith",
+        vision_model: bool | None = None,
+        preprocessors: list[Any] | None = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self.variant = variant
+        self.vision_model = vision_model
+
+    @property
+    def inputs(self) -> dict[str, dict[int, str]]:
+        if self.variant == "monolith":
+            inputs = {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}}
+            if self.task in ["feature-extraction", "zero-shot-image-classification"]:
+                inputs.update(
+                    {
+                        "input_ids": {0: "text_batch_size", 1: "sequence_length"},
+                        "attention_mask": {0: "text_batch_size", 1: "sequence_length"},
+                    }
+                )
+        else:
+            if self.vision_model:
+                inputs = {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}}
+            else:
+                inputs = {
+                    "input_ids": {0: "text_batch_size", 1: "sequence_length"},
+                    "attention_mask": {0: "text_batch_size", 1: "sequence_length"},
+                }
+        return inputs
+
+    @property
+    def outputs(self) -> dict[str, dict[int, str]]:
+        if self.variant == "split":
+            if self.vision_model:
+                return {
+                    "image_embeds": {0: "batch_size"},
+                }
+            else:
+                return {
+                    "text_embeds": {0: "batch_size"},
+                }
+        else:
+            if self.task in ["feature-extraction", "zero-shot-image-classification"]:
+                return {
+                    "logits_per_image": {0: "image_batch_size", 1: "text_batch_size"},
+                    "logits_per_text": {0: "text_batch_size", 1: "image_batch_size"},
+                    "text_embeds": {0: "text_batch_size"},
+                    "image_embeds": {0: "image_batch_size"},
+                }
+            else:
+                return super().outputs
+
+
 class SiglipNormalizedConfig(CLIPNormalizedConfig):
     pass

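For context, here is a minimal usage sketch (not part of this PR) showing how the new config's dynamic axes could be inspected once the class is registered. The checkpoint id below is a hypothetical placeholder; substitute a real MetaCLIP 2 checkpoint.

# Hedged sketch, assuming a MetaCLIP 2 checkpoint is available on the Hub;
# "facebook/metaclip-2-worldwide-huge" is a placeholder id, not a confirmed name.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("facebook/metaclip-2-worldwide-huge")

# "split" variant, vision side only: expects pixel_values, emits image_embeds.
onnx_config = MetaCLIP2OnnxConfig(
    config,
    task="zero-shot-image-classification",
    variant="split",
    vision_model=True,
)
print(onnx_config.inputs)   # {"pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}}
print(onnx_config.outputs)  # {"image_embeds": {0: "batch_size"}}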
|
|