Skip to content

Commit 68f201a

Browse files
Add Nemotron Support (#36)
Add Nemotron Support
1 parent 5655b4d commit 68f201a

File tree

7 files changed

+24
-1
lines changed

7 files changed

+24
-1
lines changed

docs/source/onnx/overview.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
8686
- MPNet
8787
- MT5
8888
- Musicgen (text-conditional only)
89+
- Nemotron
8990
- Nystromformer
9091
- OLMo
9192
- OLMo2

optimum/exporters/onnx/model_configs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,12 @@ class GemmaOnnxConfig(LlamaOnnxConfig):
507507
MIN_TRANSFORMERS_VERSION = version.parse("4.38.0")
508508

509509

@register_tasks_manager_onnx("nemotron", *COMMON_TEXT_GENERATION_TASKS)
class NemotronOnnxConfig(GemmaOnnxConfig):
    """ONNX export configuration for Nemotron text-generation models.

    Inherits the export behavior from ``GemmaOnnxConfig`` and only overrides
    the minimum supported transformers version and the normalized-config
    class (Nemotron uses grouped-query attention, hence the GQA variant).
    """

    # 4.44.0 introduced Nemotron in transformers, but 4.48.0 is more stable
    # (earlier releases had cache-handling issues — see the commit message).
    MIN_TRANSFORMERS_VERSION = version.parse("4.48.0")
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA
514+
515+
510516
@register_tasks_manager_onnx("granite", *COMMON_TEXT_GENERATION_TASKS)
511517
class GraniteOnnxConfig(LlamaOnnxConfig):
512518
MIN_TRANSFORMERS_VERSION = version.parse("4.45.0")

optimum/exporters/onnx/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@
8585
"internlm2",
8686
"llama",
8787
"mistral",
88+
"nemotron",
8889
"phi",
8990
"phi3",
9091
"qwen2",

optimum/onnxruntime/modeling_decoder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def __init__(
204204
"To re-export your model, simply set `export=True` as in `from_pretrained(..., export=True, use_cache=True)`."
205205
)
206206

207-
if self.config.model_type == "gemma":
207+
if self.config.model_type in {"gemma", "nemotron"}:
208208
self.embed_size_per_head = self.config.head_dim
209209
elif self.config.model_type == "gpt_bigcode":
210210
self.embed_size_per_head = self.config.hidden_size // self.config.num_attention_heads * 2
@@ -223,6 +223,7 @@ def __init__(
223223
"helium",
224224
"mistral",
225225
"llama",
226+
"nemotron",
226227
"qwen2",
227228
"qwen3",
228229
"qwen3_moe",

tests/exporters/onnx/utils_tests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@
142142
"mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
143143
"mt5": "lewtun/tiny-random-mt5",
144144
"musicgen": "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration",
145+
"nemotron": "badaoui/tiny-random-NemotronForCausalLM",
145146
"nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
146147
"olmo": "hf-internal-testing/tiny-random-OlmoForCausalLM",
147148
"olmo2": "hf-internal-testing/tiny-random-Olmo2ForCausalLM",

tests/onnxruntime/test_decoder.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
HeliumOnnxConfig,
3838
InternLM2OnnxConfig,
3939
MPTOnnxConfig,
40+
NemotronOnnxConfig,
4041
Olmo2OnnxConfig,
4142
OlmoOnnxConfig,
4243
OPTOnnxConfig,
@@ -109,6 +110,8 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin):
109110
SUPPORTED_ARCHITECTURES.append("gemma")
110111
if is_transformers_version(">=", str(MPTOnnxConfig.MIN_TRANSFORMERS_VERSION)):
111112
SUPPORTED_ARCHITECTURES.append("mpt")
113+
if is_transformers_version(">=", str(NemotronOnnxConfig.MIN_TRANSFORMERS_VERSION)):
114+
SUPPORTED_ARCHITECTURES.append("nemotron")
112115
if is_transformers_version(">=", str(GraniteOnnxConfig.MIN_TRANSFORMERS_VERSION)):
113116
SUPPORTED_ARCHITECTURES.append("granite")
114117
if is_transformers_version(">=", str(HeliumOnnxConfig.MIN_TRANSFORMERS_VERSION)):
@@ -199,6 +202,15 @@ def test_find_untested_architectures(self):
199202
transformers_architectures = set(CONFIG_MAPPING_NAMES.keys())
200203
onnx_architectures = set(TasksManager.get_supported_model_type_for_task(task=self.TASK, exporter="onnx"))
201204
supported_architectures = onnx_architectures & transformers_architectures
205+
206+
if "nemotron" in supported_architectures and is_transformers_version(
207+
"<=", str(NemotronOnnxConfig.MIN_TRANSFORMERS_VERSION)
208+
):
209+
# Nemotron was introduced in Transformers 4.44.0, but it has some issues. Specifically, it did not properly handle legacy cache formats (Lists/Cache), and it also did not return past_key_values when use_cache=True.
210+
# We are using its 4.48.0 version, which is more stable.
211+
# So we remove it from the list of supported architectures in the versions before 4.48.0.
212+
supported_architectures.remove("nemotron")
213+
202214
untested_architectures = supported_architectures - tested_architectures
203215

204216
if len(untested_architectures) > 0:

tests/onnxruntime/testing_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
"mpnet": "hf-internal-testing/tiny-random-MPNetModel",
100100
"mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
101101
"mt5": "lewtun/tiny-random-mt5",
102+
"nemotron": "badaoui/tiny-random-NemotronForCausalLM",
102103
"nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
103104
"olmo": "katuni4ka/tiny-random-olmo-hf",
104105
"olmo2": "hf-internal-testing/tiny-random-Olmo2ForCausalLM",

0 commit comments

Comments (0)