Merged
4 changes: 4 additions & 0 deletions README.md
@@ -56,3 +56,7 @@ Once the model is exported to the ONNX format, we provide Python classes enabling …
```

More details on how to run ONNX models with `ORTModelForXXX` classes [here](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/models).

### Examples

Check out the [examples folder](./examples) for more usage examples including optimization, quantization, and model-specific demonstrations.
25 changes: 25 additions & 0 deletions examples/gemma3.py
@@ -0,0 +1,25 @@
"""Simple example: Export Gemma3 270M to ONNX and generate text.

Usage:
uv pip install onnxruntime
uv run examples/gemma3.py
"""

from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForCausalLM


model_id = "google/gemma-3-270m-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id, export=True)

# Chat with instruction-tuned model
conversation = [{"role": "user", "content": "Hello! How are you?"}]
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)
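
For readers trying the example outside this repo, a natural follow-up (not part of this PR; a minimal sketch assuming the standard `ORTModel` save/load API, with an illustrative local path) is to persist the export so later runs skip the slow export step:

```python
# Hypothetical follow-up sketch: cache the ONNX export on disk so later
# runs can load it directly instead of re-exporting. Paths are illustrative.
from transformers import AutoTokenizer

from optimum.onnxruntime import ORTModelForCausalLM

model_id = "google/gemma-3-270m-it"
save_dir = "gemma3-270m-onnx"  # assumed local directory, not from the PR

# One-time export, then persist the ONNX model and tokenizer together
model = ORTModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Later: load the already-exported ONNX model directly, no export needed
model = ORTModelForCausalLM.from_pretrained(save_dir)
```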
7 changes: 7 additions & 0 deletions optimum/exporters/onnx/model_configs.py
@@ -517,6 +517,13 @@ class Gemma2OnnxConfig(TextDecoderOnnxConfig):
MIN_TRANSFORMERS_VERSION = version.parse("4.53.0")


@register_tasks_manager_onnx("gemma3", *COMMON_TEXT_GENERATION_TASKS)
@register_tasks_manager_onnx("gemma3_text", *COMMON_TEXT_GENERATION_TASKS)
class Gemma3OnnxConfig(GemmaOnnxConfig):
"""ONNX config for Gemma3 text-only models."""
MIN_TRANSFORMERS_VERSION = version.parse("4.52.0")
@IlyasMoutawwakil (Member) commented on Oct 22, 2025:

I bumped to 4.53.0 and added a comment about why

@register_tasks_manager_onnx("gpt_oss", *COMMON_TEXT_GENERATION_TASKS)
class GPTOssOnnxConfig(GemmaOnnxConfig):
MIN_TRANSFORMERS_VERSION = version.parse("4.55.0")
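For context, once the config is registered in the tasks manager as above, the export should also be reachable programmatically. A minimal sketch, assuming the registered `gemma3` config is picked up by the standard `main_export` entry point (the output path is illustrative):

```python
# Minimal sketch: export Gemma3 through the standard export entry point,
# which dispatches via the task registry shown in the diff above.
from optimum.exporters.onnx import main_export

main_export(
    "google/gemma-3-270m-it",          # model id from the example script
    output="gemma3_onnx",              # assumed local output directory
    task="text-generation-with-past",  # export with KV-cache support
)
```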
4 changes: 3 additions & 1 deletion optimum/onnxruntime/modeling_decoder.py
@@ -185,7 +185,7 @@ def __init__(
"To re-export your model, simply set `export=True` as in `from_pretrained(..., export=True, use_cache=True)`."
)

if self.config.model_type in {"gemma", "gpt_oss", "nemotron"}:
if self.config.model_type in {"gemma", "gemma3", "gemma3_text", "gpt_oss", "nemotron"}:
self.embed_size_per_head = self.config.head_dim
elif self.old_gpt_bigcode_modeling:
# (before v4.54) GPT BigCode fuses keys and values in one tensor, doubling the head dimension
@@ -202,6 +202,8 @@
"deepseek_v3",
"cohere",
"gemma",
"gemma3",
"gemma3_text",
"glm",
"granite",
"gpt_oss",
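The reason gemma3 joins these sets: for Gemma-family models, `config.head_dim` is set explicitly and need not equal `hidden_size // num_attention_heads`, so the per-head size of the ONNX KV cache must come from `head_dim`. A small sketch of the shape logic this change affects (the config values below are illustrative, not taken from the actual Gemma3 config):

```python
# Sketch of the past_key_values shape logic this diff touches; the config
# values are illustrative placeholders, not the real Gemma3 hyperparameters.
class FakeGemma3Config:
    model_type = "gemma3_text"
    hidden_size = 640
    num_attention_heads = 4
    num_key_value_heads = 1
    head_dim = 256  # set explicitly, != hidden_size // num_attention_heads

config = FakeGemma3Config()

if config.model_type in {"gemma", "gemma3", "gemma3_text", "gpt_oss", "nemotron"}:
    embed_size_per_head = config.head_dim  # 256
else:
    embed_size_per_head = config.hidden_size // config.num_attention_heads  # would be 160

# KV cache tensors are (batch_size, num_key_value_heads, sequence_length, embed_size_per_head)
batch_size, sequence_length = 1, 8
kv_shape = (batch_size, config.num_key_value_heads, sequence_length, embed_size_per_head)
print(kv_shape)  # (1, 1, 8, 256)
```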
2 changes: 2 additions & 0 deletions tests/exporters/onnx/utils_tests.py
@@ -103,6 +103,8 @@
"flaubert": "hf-internal-testing/tiny-random-flaubert",
"gemma": "fxmarty/tiny-random-GemmaForCausalLM",
"gemma2": "hf-internal-testing/tiny-random-Gemma2ForCausalLM",
"gemma3": "hf-internal-testing/tiny-random-Gemma3ForCausalLM",
"gemma3_text": "hf-internal-testing/tiny-random-Gemma3ForCausalLM",
"glm": "hf-internal-testing/tiny-random-GlmForCausalLM",
"glpn": "hf-internal-testing/tiny-random-GLPNModel",
"gpt2": "hf-internal-testing/tiny-random-gpt2",
3 changes: 3 additions & 0 deletions tests/onnxruntime/test_decoder.py
@@ -33,6 +33,7 @@
CohereOnnxConfig,
DeepSeekV3OnnxConfig,
Gemma2OnnxConfig,
Gemma3OnnxConfig,
GemmaOnnxConfig,
GLMOnnxConfig,
GPTOssOnnxConfig,
@@ -118,6 +119,8 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
SUPPORTED_ARCHITECTURES.append("gemma")
if is_transformers_version(">=", str(Gemma2OnnxConfig.MIN_TRANSFORMERS_VERSION)):
SUPPORTED_ARCHITECTURES.append("gemma2")
if is_transformers_version(">=", str(Gemma3OnnxConfig.MIN_TRANSFORMERS_VERSION)):
SUPPORTED_ARCHITECTURES.extend(["gemma3", "gemma3_text"])
if is_transformers_version(">=", str(GLMOnnxConfig.MIN_TRANSFORMERS_VERSION)):
SUPPORTED_ARCHITECTURES.append("glm")
if is_transformers_version(">=", str(MPTOnnxConfig.MIN_TRANSFORMERS_VERSION)):
2 changes: 2 additions & 0 deletions tests/onnxruntime/testing_utils.py
@@ -66,6 +66,8 @@
"flux": "optimum-internal-testing/tiny-random-flux",
"gemma": "fxmarty/tiny-random-GemmaForCausalLM",
"gemma2": "hf-internal-testing/tiny-random-Gemma2ForCausalLM",
"gemma3": "hf-internal-testing/tiny-random-Gemma3ForCausalLM",
"gemma3_text": "hf-internal-testing/tiny-random-Gemma3ForCausalLM",
"glm": "hf-internal-testing/tiny-random-GlmForCausalLM",
"gpt2": "hf-internal-testing/tiny-random-GPT2LMHeadModel",
"gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",