Changes from all commits
56 commits
9afac3c
Initial commit
danielssonsimonbcg Sep 2, 2025
4bc0fab
Add OnnxConfig
simondanielsson Sep 2, 2025
ae1bfce
Add gemma3 to list of models requiring position ids
simondanielsson Sep 2, 2025
1775f54
Add vocab size to text config
simondanielsson Sep 2, 2025
35a647f
Add normalized text and vision config to Gemma3
simondanielsson Sep 2, 2025
454d47b
Update normalized config to grab from manager
simondanielsson Sep 2, 2025
cd831b6
Remove unused logging
simondanielsson Sep 3, 2025
97c09b9
Update dummy input generator
simondanielsson Sep 3, 2025
03d4de3
Add gemma3 tests
simondanielsson Sep 3, 2025
8345410
Improve formatting of Gemma3OnnxConfig
simondanielsson Sep 3, 2025
55b5280
Add gemma3 onnxruntime tests
simondanielsson Sep 3, 2025
510c2b7
Add Gemma and Gemma3 to list of supported models in docs
simondanielsson Sep 3, 2025
41f0c75
Add Gemma3 to test_decoder.py
simondanielsson Sep 3, 2025
709fff4
Remove commented code
simondanielsson Sep 3, 2025
329062c
Reset formatting in onnx.py
simondanielsson Sep 3, 2025
401f5f7
Remove .DS_Store
simondanielsson Sep 3, 2025
38407e9
Fix formatting
simondanielsson Sep 3, 2025
bd4f48c
Stub base VLM onnx config
simondanielsson Sep 4, 2025
33e2ec6
First version of multimodal OnnxConfig
simondanielsson Sep 4, 2025
d9bd16c
Allow registering custom classes and tasks from optimum-onnx
simondanielsson Sep 4, 2025
04709f9
Implement monolith export of Gemma3
simondanielsson Sep 5, 2025
30fde4c
Add support for exporting multiple submodels
simondanielsson Sep 5, 2025
57499b8
Add support for exporting for feature-extraction
simondanielsson Sep 6, 2025
ad68aa1
Add support for with-past
simondanielsson Sep 8, 2025
b420d36
Move classes to appropriate modules
simondanielsson Sep 8, 2025
90bf94c
Remove TODO comment
simondanielsson Sep 8, 2025
e973905
Remove another comment
simondanielsson Sep 8, 2025
01611f4
Rename constant
simondanielsson Sep 8, 2025
7eca9c6
Fix with make style
simondanielsson Sep 8, 2025
e1cedf4
Remove dev from transformer version
simondanielsson Sep 9, 2025
bf27714
Add copyright/license in top of input_generators.py
simondanielsson Sep 9, 2025
a8c0d4a
Remove injection of preprocessors
simondanielsson Sep 9, 2025
54b7e2b
Generate random inputs
simondanielsson Sep 9, 2025
da4b9de
Update VLMDecoderOnnxConfig to be streamlined
simondanielsson Sep 9, 2025
3b2cca6
Add large model test
simondanielsson Sep 9, 2025
522264f
Remove ds.store
simondanielsson Sep 9, 2025
8c0c97e
Propagate behaviors correctly
simondanielsson Sep 9, 2025
eea29dc
Remove ModelPatcher and implement directly in patch_model_for_export
simondanielsson Sep 9, 2025
1359fb1
Remove unused import
simondanielsson Sep 9, 2025
c0148b3
Apply use_cache if use_past is enabled and running LM or monolith
simondanielsson Sep 9, 2025
d6ff393
Add failing tests
simondanielsson Sep 9, 2025
6253636
Attach correct configs for submodels
simondanielsson Sep 9, 2025
e400e07
Add export of multimodal projector
simondanielsson Sep 10, 2025
2f7aa20
Add LM patcher from optimum-intel
simondanielsson Sep 10, 2025
710057d
Remove import of override
simondanielsson Sep 10, 2025
ddd4e74
Update tests
simondanielsson Sep 10, 2025
e9a8811
Run make style
simondanielsson Sep 10, 2025
436b59e
Remove LM head and add text_encoder
simondanielsson Sep 11, 2025
6398669
Remove LM head from tests
simondanielsson Sep 11, 2025
37ea777
Add LANGUAGE_MODEL_WITH_HEAD behavior
simondanielsson Sep 11, 2025
67a4302
Update tests to use _with_head
simondanielsson Sep 11, 2025
60b250f
Update exported submodels
simondanielsson Sep 11, 2025
c7bb9f9
Stub modeling_vlm module and proper onnxruntime tests
simondanielsson Sep 11, 2025
e76ab45
Remove unused injection of processors in OnnxConfigWithPast
simondanielsson Sep 11, 2025
9b839bc
Remove dsstore
simondanielsson Sep 11, 2025
99be0fb
Change superclass of ORTModelForVisualCausalLM to ORTModel
simondanielsson Sep 11, 2025
2 changes: 2 additions & 0 deletions docs/source/onnx/overview.mdx
@@ -52,6 +52,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
- ESM
- Falcon
- Flaubert
- Gemma
- Gemma3
- GPT-2
- GPT-BigCode
- GPT-J
2 changes: 2 additions & 0 deletions optimum/exporters/onnx/base.py
@@ -124,6 +124,8 @@ class OnnxConfig(ExporterConfig, ABC):
"image-to-image": OrderedDict(
{"reconstruction": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}}
),
"image-text-to-text": OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}}),
"image-text-to-text-with-past": OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}}),
"keypoint-detection": OrderedDict(
{"heatmaps": {0: "batch_size", 1: "num_keypoints", 2: "height", 3: "width"}}
),
260 changes: 259 additions & 1 deletion optimum/exporters/onnx/config.py
@@ -15,13 +15,16 @@

from __future__ import annotations

import enum
from collections import OrderedDict
from collections.abc import Iterable
from pathlib import Path
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, ClassVar, Self

from optimum.exporters.onnx.base import ConfigBehavior, OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
from optimum.exporters.onnx.constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
from optimum.exporters.onnx.model_patcher import ModelPatcher
from optimum.exporters.tasks import TasksManager
from optimum.utils import (
DummyAudioInputGenerator,
DummyBboxInputGenerator,
@@ -465,3 +468,258 @@ def post_process_exported_models(
models_and_onnx_configs[ONNX_DECODER_WITH_PAST_NAME][1]._decoder_onnx_config.is_merged = True

return models_and_onnx_configs, onnx_files_subpaths


class VLMConfigBehavior(str, enum.Enum):
"""Specifies the behavior of the [`~exporters.onnx.base.VLMDecoderOnnxConfig`].

- MONOLITH: the config can be used to export the entire multimodal model as a single file.
- VISION_ENCODER: the config can be used to export the underlying vision encoder.
- MULTIMODAL_PROJECTOR: the config can be used to export the underlying multimodal projector.
- TEXT_ENCODER: the config can be used to export the underlying text encoder, mapping inputs ids to embeddings.
- LANGUAGE_MODEL: the config can be used to export the underlying language model. Note: this does not
include the language model head.
"""

MONOLITH = "monolith"
VISION_ENCODER = "vision_encoder"
MULTIMODAL_PROJECTOR = "multimodal_projector"
TEXT_ENCODER = "text_encoder"
LANGUAGE_MODEL = "language_model"
LANGUAGE_MODEL_WITH_HEAD = "language_model_with_head"


class VLMDecoderOnnxConfig(TextDecoderOnnxConfig):
"""Base config for decoder-based vision language models."""

DUMMY_INPUT_GENERATOR_CLASSES = TextAndVisionOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
SUPPORTED_BEHAVIORS: ClassVar[list[VLMConfigBehavior]] = list(VLMConfigBehavior)

def __init__(
self,
config: PretrainedConfig,
task: str = "feature-extraction",
int_dtype: str = "int64",
float_dtype: str = "fp32",
use_past: bool = False,
use_past_in_inputs: bool = False,
preprocessors: list[Any] | None = None,
legacy: bool = False,
behavior: VLMConfigBehavior = VLMConfigBehavior.MONOLITH,
):
super().__init__(
config=config,
task=task,
int_dtype=int_dtype,
float_dtype=float_dtype,
use_past=use_past,
use_past_in_inputs=use_past_in_inputs,
preprocessors=preprocessors,
legacy=legacy,
)
self._behavior = behavior

@property
def behavior(self) -> VLMConfigBehavior:
"""The behavior property."""
return self._behavior

@behavior.setter
def behavior(self, value: str | VLMConfigBehavior) -> None:
if isinstance(value, str):
try:
value = VLMConfigBehavior(value)
except ValueError:
raise ValueError(
f"behavior must be one of {self.SUPPORTED_BEHAVIORS}, but got {value} instead."
) from None

self._behavior = value

def get_supported_behaviors(self, task: str) -> list[VLMConfigBehavior]:
"""Get supported behaviors for this model.

The supported behaviors are task-dependent. For instance, "text-generation" is handled by
the language model and associated head.
"""
if "image-text-to-text" in task:
# All parts of the model
return [
VLMConfigBehavior.VISION_ENCODER,
VLMConfigBehavior.MULTIMODAL_PROJECTOR,
VLMConfigBehavior.TEXT_ENCODER,
VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD,
]

elif "text-generation" in task:
# Only text-related components needed
return [
VLMConfigBehavior.TEXT_ENCODER,
VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD,
]

elif "feature-extraction" in task:
# Same as image-text-to-text but without the LM head
return [
VLMConfigBehavior.VISION_ENCODER,
VLMConfigBehavior.MULTIMODAL_PROJECTOR,
VLMConfigBehavior.TEXT_ENCODER,
VLMConfigBehavior.LANGUAGE_MODEL,
]

else:
message = f"Invalid task for {self.__class__.__name__}: {task}"
raise ValueError(message)

def with_behavior(self, behavior: VLMConfigBehavior) -> Self:
if behavior == VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD:
model_config = self._config.text_config
model_type = model_config.model_type

if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
raise ValueError(
f"Unsupported language model type provided `{model_type}`. Please define custom export config"
)

lm_task = "text-generation-with-past" if self.use_past else "text-generation"
exporter_config_constructor = TasksManager.get_exporter_config_constructor(
exporter="onnx",
model_type=model_type,
task=lm_task,
)
return exporter_config_constructor(
model_config,
int_dtype=self.int_dtype,
float_dtype=self.float_dtype,
use_past=self.use_past,
use_past_in_inputs=self.use_past_in_inputs,
)

elif behavior in [
VLMConfigBehavior.MONOLITH,
VLMConfigBehavior.TEXT_ENCODER,
VLMConfigBehavior.VISION_ENCODER,
VLMConfigBehavior.MULTIMODAL_PROJECTOR,
VLMConfigBehavior.LANGUAGE_MODEL,
]:
# TODO: check if we need to handle vision encoder part similarly, with config.vision_config
return type(self)(
config=self._config,
task=self.task,
int_dtype=self.int_dtype,
float_dtype=self.float_dtype,
use_past=self.use_past,
use_past_in_inputs=self.use_past_in_inputs,
preprocessors=self._preprocessors,
legacy=self.legacy,
behavior=behavior,
)

message = f"Behavior must be one of {self.SUPPORTED_BEHAVIORS}, but got {behavior} instead."
raise ValueError(message)

def get_model_for_behavior(self, model: PreTrainedModel, behavior: VLMConfigBehavior):
if behavior != self.behavior:
raise ValueError(
f"Config behavior {self.behavior} does not match the requested behavior {behavior}. Please run `.with_behavior` first."
)

if behavior == VLMConfigBehavior.LANGUAGE_MODEL:
return model.language_model

if behavior == VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD:
# No default way to get just the LM and LM head, so we return the entire model.
return model

if behavior == VLMConfigBehavior.VISION_ENCODER:
vision_encoder = model.vision_tower
vision_encoder.config = model.config.vision_config
return vision_encoder

if behavior == VLMConfigBehavior.MULTIMODAL_PROJECTOR:
multi_modal_projector = model.multi_modal_projector
# TODO: check if multimodal projector actually accepts the base config, not config.vision_config
multi_modal_projector.config = model.config
return multi_modal_projector

if behavior == VLMConfigBehavior.MONOLITH:
return model

if behavior == VLMConfigBehavior.TEXT_ENCODER:
return model.get_input_embeddings()

message = f"Behavior must be one of {self.SUPPORTED_BEHAVIORS}, but got {behavior} instead."
raise ValueError(message)

@property
def inputs(self) -> dict[str, dict[int, str]]:
if self.behavior == VLMConfigBehavior.VISION_ENCODER:
return {"pixel_values": {0: "batch_size"}}

if self.behavior == VLMConfigBehavior.MULTIMODAL_PROJECTOR:
# Should be batch_size, number of tokens per image, and hidden size of the vision encoder
return {
"vision_outputs": {
0: "batch_size",
1: "num_patch_tokens",
2: "hidden_size",
}
}

if self.behavior in (
VLMConfigBehavior.LANGUAGE_MODEL,
VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD,
):
return super().inputs

if self.behavior == VLMConfigBehavior.MONOLITH:
inputs = super().inputs

# Only image-text-to-text tasks take image inputs; plain text-generation does not.
if "image-text-to-text" in self.task:
# No need to add channel and image dimensions
inputs["pixel_values"] = {0: "batch_size"}

return inputs

if self.behavior == VLMConfigBehavior.TEXT_ENCODER:
return super().inputs

message = f"Behavior must be one of {self.SUPPORTED_BEHAVIORS}, but got {self.behavior} instead."
raise ValueError(message)

@property
def outputs(self) -> dict[str, dict[int, str]]:
if self.behavior in (
VLMConfigBehavior.VISION_ENCODER,
VLMConfigBehavior.LANGUAGE_MODEL,
):
return {"last_hidden_state": {0: "batch_size"}}

if self.behavior == VLMConfigBehavior.MULTIMODAL_PROJECTOR:
return {
"image_features": {
0: "batch_size",
1: "mm_tokens_per_image",
2: "text_hidden_size",
}
}

if self.behavior == VLMConfigBehavior.TEXT_ENCODER:
return {"inputs_embeds": {0: "batch_size", 1: "sequence_length"}}

return super().outputs

def patch_model_for_export(
self, model: PreTrainedModel, model_kwargs: dict[str, Any] | None = None
) -> ModelPatcher:
if self.behavior in (
VLMConfigBehavior.LANGUAGE_MODEL,
VLMConfigBehavior.LANGUAGE_MODEL_WITH_HEAD,
VLMConfigBehavior.MONOLITH,
):
if model_kwargs is None:
model_kwargs = {}
model_kwargs["use_cache"] = self.use_past

return super().patch_model_for_export(model=model, model_kwargs=model_kwargs)
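
For orientation, a hedged usage sketch of the behavior API added in this file: it resolves the exporter config through TasksManager and derives one config per submodel. The checkpoint name and the assumption that "gemma3" with the image-text-to-text task resolves to a VLMDecoderOnnxConfig subclass are illustrative only, not part of this diff.

# Illustrative sketch only; not code from this PR.
from transformers import AutoConfig

from optimum.exporters.tasks import TasksManager

config = AutoConfig.from_pretrained("google/gemma-3-4b-it")  # assumed checkpoint name

# Mirrors the constructor lookup used inside with_behavior() above.
constructor = TasksManager.get_exporter_config_constructor(
    exporter="onnx",
    model_type="gemma3",
    task="image-text-to-text",
)
onnx_config = constructor(config)

# One config per exported submodel: vision encoder, multimodal projector,
# text encoder, and language model with head.
for behavior in onnx_config.get_supported_behaviors(task=onnx_config.task):
    sub_config = onnx_config.with_behavior(behavior)
    print(behavior.value, "inputs:", list(sub_config.inputs), "outputs:", list(sub_config.outputs))
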
2 changes: 2 additions & 0 deletions optimum/exporters/onnx/constants.py
@@ -38,3 +38,5 @@
"musicgen",
"whisper",
]

VLM_TEXT_GENERATION_MODELS = ["gemma3"]