
Commit 4f78a83

Add Gemma2 support (#32)
Co-authored-by: Ilyas Moutawwakil <[email protected]>
Co-authored-by: IlyasMoutawwakil <[email protected]>
1 parent 2ba7502

File tree

6 files changed: +26 -1 lines changed


docs/source/onnx/overview.mdx (2 additions, 0 deletions)

@@ -52,6 +52,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - ESM
 - Falcon
 - Flaubert
+- Gemma
+- Gemma 2
 - GLM
 - GPT-2
 - GPT-BigCode

optimum/exporters/onnx/model_configs.py (10 additions, 0 deletions)

@@ -512,6 +512,16 @@ class GemmaOnnxConfig(LlamaOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.38.0")
 
 
+@register_tasks_manager_onnx("gemma2", *[*COMMON_TEXT_GENERATION_TASKS, "text-classification"])
+class Gemma2OnnxConfig(TextDecoderOnnxConfig):
+    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator)
+    DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator
+    # Gemma 2 was added in transformers v4.42 using HybridCache
+    # (tuple of past_key_values never supported), DynamicCache since v4.53
+    MIN_TRANSFORMERS_VERSION = version.parse("4.53.0")
+
+
 @register_tasks_manager_onnx("gpt_oss", *COMMON_TEXT_GENERATION_TASKS)
 class GPTOssOnnxConfig(GemmaOnnxConfig):
     MIN_TRANSFORMERS_VERSION = version.parse("4.55.0")

optimum/onnxruntime/modeling_decoder.py (3 additions, 1 deletion)

@@ -747,6 +747,7 @@ def _from_pretrained(
 
         # Important: for encoder-decoder models used with CausalLM, we need to set the is_decoder flag to True
         # and the is_encoder_decoder flag to False. This is needed for the model to work correctly with generation logic.
+        config.use_cache = use_cache
         if hasattr(config, "is_decoder"):
             config.is_decoder = True
         if hasattr(config, "is_encoder_decoder"):
@@ -770,7 +771,8 @@ def _from_pretrained(
             generation_config = GenerationConfig.from_model_config(config)
 
         generation_config.use_cache = use_cache
-        config.use_cache = use_cache
+        if hasattr(generation_config, "cache_implementation"):
+            generation_config.cache_implementation = None
 
         if is_transformers_version(">=", "4.45.0"):
            misplaced_generation_parameters = config._get_non_default_generation_parameters()
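Two things happen above: config.use_cache = use_cache moves earlier, so it is already set when the generation config is derived from config, and a new guard clears generation_config.cache_implementation. Gemma 2 checkpoints ship a generation config requesting the "hybrid" cache implementation; left in place, generate() would try to build a transformers HybridCache, which the exported graph cannot consume since it only exposes plain past_key_values inputs and outputs. A standalone sketch of the guarded behavior, with the "hybrid" value taken from Gemma 2's published generation config:

from transformers import GenerationConfig

# As loaded for a Gemma 2 checkpoint: the config asks transformers to build a
# HybridCache, which an ONNX Runtime session has no way to consume.
generation_config = GenerationConfig(cache_implementation="hybrid")

# The same guard as in the diff: clear the flag so generation falls back to the
# exported graph's explicit past_key_values inputs/outputs.
if hasattr(generation_config, "cache_implementation"):
    generation_config.cache_implementation = None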

tests/exporters/onnx/utils_tests.py (1 addition, 0 deletions)

@@ -103,6 +103,7 @@
     },
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
+    "gemma2": "hf-internal-testing/tiny-random-Gemma2ForCausalLM",
     "glm": "hf-internal-testing/tiny-random-GlmForCausalLM",
     "glpn": "hf-internal-testing/tiny-random-GLPNModel",
     "gpt2": "hf-internal-testing/tiny-random-gpt2",

tests/onnxruntime/test_decoder.py (9 additions, 0 deletions)

@@ -32,6 +32,7 @@
     BloomOnnxConfig,
     CohereOnnxConfig,
     DeepSeekV3OnnxConfig,
+    Gemma2OnnxConfig,
     GemmaOnnxConfig,
     GLMOnnxConfig,
     GPTOssOnnxConfig,
@@ -113,6 +114,8 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
         SUPPORTED_ARCHITECTURES.append("qwen2")
     if is_transformers_version(">=", str(GemmaOnnxConfig.MIN_TRANSFORMERS_VERSION)):
         SUPPORTED_ARCHITECTURES.append("gemma")
+    if is_transformers_version(">=", str(Gemma2OnnxConfig.MIN_TRANSFORMERS_VERSION)):
+        SUPPORTED_ARCHITECTURES.append("gemma2")
     if is_transformers_version(">=", str(GLMOnnxConfig.MIN_TRANSFORMERS_VERSION)):
         SUPPORTED_ARCHITECTURES.append("glm")
     if is_transformers_version(">=", str(MPTOnnxConfig.MIN_TRANSFORMERS_VERSION)):
@@ -220,6 +223,12 @@ def test_find_untested_architectures(self):
             # So we remove it from the list of supported architectures in the versions before 4.48.0.
             supported_architectures.remove("nemotron")
 
+        if "gemma2" in supported_architectures and is_transformers_version(
+            "<", str(Gemma2OnnxConfig.MIN_TRANSFORMERS_VERSION)
+        ):
+            # Gemma 2 was added in transformers v4.42 using HybridCache (tuple of past_key_values never supported), DynamicCache since v4.53
+            supported_architectures.remove("gemma2")
+
         untested_architectures = supported_architectures - tested_architectures
 
         if len(untested_architectures) > 0:
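The gate reuses the export-side constraint: the same Gemma2OnnxConfig.MIN_TRANSFORMERS_VERSION decides both when "gemma2" joins SUPPORTED_ARCHITECTURES and when it is excused from the untested-architectures check. A small sketch of the version helper's behavior, assuming it is importable from optimum.utils as in this test module:

from optimum.utils import is_transformers_version

# True only when the installed transformers has the DynamicCache-based Gemma 2
# path, i.e. the version named by Gemma2OnnxConfig.MIN_TRANSFORMERS_VERSION.
if is_transformers_version(">=", "4.53.0"):
    print("gemma2 would be appended to SUPPORTED_ARCHITECTURES here")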

tests/onnxruntime/testing_utils.py (1 addition, 0 deletions)

@@ -68,6 +68,7 @@
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
     "flux": "optimum-internal-testing/tiny-random-flux",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
+    "gemma2": "hf-internal-testing/tiny-random-Gemma2ForCausalLM",
     "glm": "hf-internal-testing/tiny-random-GlmForCausalLM",
     "gpt2": "hf-internal-testing/tiny-random-GPT2LMHeadModel",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
