feat: add support for gemma3-text

thewh1teagle · thewh1teagle · commit e6636338ec39 · 2025-10-15T06:22:53.000+03:00
simplify the example
diff --git a/examples/gemma3.py b/examples/gemma3.py
@@ -0,0 +1,25 @@
+"""Simple example: export Gemma3 270M (instruction-tuned) to ONNX and generate a chat reply.
+
+Usage:
+    uv pip install onnxruntime
+    uv run examples/gemma3.py
+"""
+
+from transformers import AutoTokenizer
+
+from optimum.onnxruntime import ORTModelForCausalLM
+
+
+model_id = "google/gemma-3-270m-it"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = ORTModelForCausalLM.from_pretrained(model_id, export=True)
+
+conversation = [{"role": "user", "content": "Hello! How are you?"}]
+prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+# The rendered chat template already contains the special tokens (e.g. BOS); do not add them twice.
+inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+
+outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
+# generate() returns prompt + continuation; decode only the newly generated tokens.
+new_tokens = outputs[0][inputs["input_ids"].shape[1] :]
+print(tokenizer.decode(new_tokens, skip_special_tokens=True))
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
@@ -43,6 +43,7 @@
     CohereModelPatcher,
     FluxTransformerModelPatcher,
     MetaCLIP2Patcher,
+    Gemma3LMModelPatcher,
     MgpstrModelPatcher,
     MoonshineModelPatcher,
     MusicgenModelPatcher,
@@ -517,6 +518,14 @@ class Gemma2OnnxConfig(TextDecoderOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.53.0")
 
 
+@register_tasks_manager_onnx("gemma3", *COMMON_TEXT_GENERATION_TASKS)
+@register_tasks_manager_onnx("gemma3_text", *COMMON_TEXT_GENERATION_TASKS)
+class Gemma3OnnxConfig(GemmaOnnxConfig):
+    """ONNX config for Gemma3 text generation, registered under both `gemma3` and `gemma3_text`."""
+    MIN_TRANSFORMERS_VERSION = version.parse("4.52.0")  # NOTE(review): Gemma2 above needs 4.53.0 - confirm
+    _MODEL_PATCHER = Gemma3LMModelPatcher  # installs a legacy-cache forward shim on transformers < 4.53.0
+
+
 @register_tasks_manager_onnx("gpt_oss", *COMMON_TEXT_GENERATION_TASKS)
 class GPTOssOnnxConfig(GemmaOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.55.0")
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
@@ -18,7 +18,7 @@
 import inspect
 import sys
 import types
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 import torch
 import transformers
@@ -30,6 +30,7 @@
     jit_utils,
     symbolic_helper,
 )
+from transformers import PreTrainedModel, TFPreTrainedModel
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet
 
@@ -1444,3 +1445,57 @@ def __exit__(self, exc_type, exc_value, traceback):
             from transformers.models.cohere.modeling_cohere import CohereRotaryEmbedding
 
             CohereRotaryEmbedding.forward = self.original_forward
+
+
+class Gemma3LMModelPatcher(DecoderModelPatcher):
+    """Patcher for Gemma3 text decoders that bridges tuple (legacy) KV caches during ONNX export.
+
+    On transformers < 4.53.0 `model.forward` is swapped (in `__init__`, undone in `__exit__`) for a shim
+    that feeds the model a `DynamicCache` plus `cache_position` and returns the cache as legacy tuples.
+    """
+
+    def __init__(
+        self,
+        config,
+        model: Union[PreTrainedModel, TFPreTrainedModel],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        # Every input is optional so the shim accepts token ids as well as precomputed embeddings.
+        def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            position_ids=None,
+            past_key_values=None,
+            inputs_embeds=None,
+            use_cache=True,
+        ):
+            from transformers.cache_utils import DynamicCache
+
+            pkv = DynamicCache.from_legacy_cache(past_key_values)
+            tokens = inputs_embeds if inputs_embeds is not None else input_ids
+            start = past_key_values[0][0].shape[-2] if past_key_values is not None else 0
+            cache_position = torch.arange(start, start + tokens.shape[1], device=tokens.device)
+            result = self.__orig_forward(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                cache_position=cache_position,
+                past_key_values=pkv,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+            )
+            upd_pkv = result.get("past_key_values")  # may be missing, e.g. with use_cache=False
+            if upd_pkv is not None:
+                result["past_key_values"] = upd_pkv.to_legacy_cache()
+            return result
+
+        if is_transformers_version("<", "4.53.0"):
+            model.__orig_forward = model.forward
+            model.forward = types.MethodType(forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if is_transformers_version("<", "4.53.0"):
+            self._model.forward = self._model.__orig_forward
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
@@ -185,7 +185,7 @@ def __init__(
                 "To re-export your model, simply set `export=True` as in `from_pretrained(..., export=True, use_cache=True)`."
             )
 
-        if self.config.model_type in {"gemma", "gpt_oss", "nemotron"}:
+        if self.config.model_type in {"gemma", "gemma3", "gemma3_text", "gpt_oss", "nemotron"}:
             self.embed_size_per_head = self.config.head_dim
         elif self.old_gpt_bigcode_modeling:
             # (before v4.54) GPT BigCode fuses keys and values in one tensor, doubling the head dimension
@@ -202,6 +202,8 @@ def __init__(
             "deepseek_v3",
             "cohere",
             "gemma",
+            "gemma3",
+            "gemma3_text",
             "glm",
             "granite",
             "gpt_oss",
diff --git a/uv.lock b/uv.lock