Commit 18fb095
[Model builder] Add support for Ernie 4.5 models (#1608)
Enables exporting the new Ernie 4.5 models via onnxruntime-genai: https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT

I've uploaded the converted model to https://huggingface.co/onnx-community/ERNIE-4.5-0.3B-ONNX. Currently only supports the non-MoE version... but maybe someone can help with the MoE version: https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-PT

---

Models tested and validated with python ort & transformers.js (huggingface/transformers.js#1354):

```py
from transformers import AutoConfig, AutoTokenizer
import onnxruntime
import numpy as np

# 1. Load config, processor, and model
path_to_model = "./path/to/model"
config = AutoConfig.from_pretrained("baidu/ERNIE-4.5-0.3B-PT", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("baidu/ERNIE-4.5-0.3B-PT", trust_remote_code=True)
decoder_session = onnxruntime.InferenceSession(f"{path_to_model}/model.onnx")

## Set config values
num_key_value_heads = config.num_key_value_heads
head_dim = config.head_dim
num_hidden_layers = config.num_hidden_layers
eos_token_id = config.eos_token_id

# 2. Prepare inputs
## Create input messages
messages = [
  { "role": "system", "content": "You are a helpful assistant." },
  { "role": "user", "content": "Write me a poem about Machine Learning." },
]

## Apply tokenizer
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np")

## Prepare decoder inputs
batch_size = inputs['input_ids'].shape[0]
past_key_values = {
    f'past_key_values.{layer}.{kv}': np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32)
    for layer in range(num_hidden_layers)
    for kv in ('key', 'value')
}
input_ids = inputs['input_ids']
position_ids = np.tile(np.arange(1, input_ids.shape[-1] + 1), (batch_size, 1))
attention_mask = np.ones_like(input_ids, dtype=np.int64)

# 3. Generation loop
max_new_tokens = 1024
generated_tokens = np.array([[]], dtype=np.int64)
for i in range(max_new_tokens):
    logits, *present_key_values = decoder_session.run(None, dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        **past_key_values,
    ))

    ## Update values for next generation loop
    input_ids = logits[:, -1].argmax(-1, keepdims=True)
    attention_mask = np.concatenate([attention_mask, np.ones_like(input_ids, dtype=np.int64)], axis=-1)
    position_ids = position_ids[:, -1:] + 1
    for j, key in enumerate(past_key_values):
        past_key_values[key] = present_key_values[j]
    generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1)
    if (input_ids == eos_token_id).all():
        break

    ## (Optional) Streaming
    print(tokenizer.decode(input_ids[0]), end='', flush=True)
print()

# 4. Output result
print(tokenizer.batch_decode(generated_tokens))
```

---------

Co-authored-by: kunal-vaishnavi <[email protected]>
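For reference, a conversion along these lines can be produced with the model builder; the snippet below is an illustrative sketch rather than part of the commit — the output and cache paths are placeholders, and the exact flags may differ across onnxruntime-genai versions:

```sh
# Export an FP32 CPU ONNX model for ERNIE-4.5-0.3B-PT (paths are placeholders)
python3 -m onnxruntime_genai.models.builder \
  -m baidu/ERNIE-4.5-0.3B-PT \
  -o ./ERNIE-4.5-0.3B-ONNX \
  -p fp32 \
  -e cpu \
  -c ./hf_cache
```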
1 parent 2f2ad90 · commit 18fb095

4 files changed · +22 −4 lines changed

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -20,7 +20,7 @@ See documentation at https://onnxruntime.ai/docs/genai.
 
 |Support matrix|Supported now|Under development|On the roadmap|
 | -------------- | ------------- | ----------------- | -------------- |
-| Model architectures | DeepSeek <br/> Gemma <br/> Llama * <br/> Mistral + <br/> Phi (language + vision) <br/> Qwen <br/> Nemotron <br/> Granite <br/> AMD OLMo | Whisper | Stable diffusion |
+| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Gemma <br/> Granite <br/> Llama * <br/> Mistral + <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen | Whisper | Stable diffusion |
 |API| Python <br/>C# <br/>C/C++ <br/> Java ^ |Objective-C||
 |Platform| Linux <br/> Windows <br/>Mac ^ <br/>Android ^ ||iOS |||
 |Architecture|x86 <br/> x64 <br/> Arm64 ~ ||||
```

src/models/model_type.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ namespace Generators {
 struct ModelType {
   inline static bool IsLLM(const std::string& model_type) {
     // Large-language model (LLM)
-    static constexpr std::array<std::string_view, 17> LLM = {"chatglm", "decoder", "gemma", "gemma2", "gemma3_text", "gpt2", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3"};
+    static constexpr std::array<std::string_view, 18> LLM = {"chatglm", "decoder", "ernie4_5", "gemma", "gemma2", "gemma3_text", "gpt2", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3"};
     return std::find(LLM.begin(), LLM.end(), model_type) != LLM.end();
   }
 
```
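For context, the `"ernie4_5"` string added here is expected to match the `model_type` declared by the Hugging Face checkpoint's config, which is what the builder and runtime key off of. A quick sanity check (illustrative snippet, not part of the commit):

```py
from transformers import AutoConfig

# Inspect the identifiers used to recognize ERNIE 4.5 checkpoints.
config = AutoConfig.from_pretrained("baidu/ERNIE-4.5-0.3B-PT", trust_remote_code=True)
print(config.model_type)        # expected: "ernie4_5"
print(config.architectures[0])  # expected: "Ernie4_5_ForCausalLM"
```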

src/python/py/models/README.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -31,15 +31,17 @@ This folder contains the model builder for quickly creating optimized and quanti
 
 The tool currently supports the following model architectures.
 
+- AMD OLMo
 - ChatGLM
+- DeepSeek
+- ERNIE 4.5
 - Gemma
 - Granite
-- LLaMA
+- Llama
 - Mistral
 - Nemotron
 - Phi
 - Qwen
-- AMD OLMo
 
 It is intended for supporting the latest, popular state-of-the-art models.
 
```
src/python/py/models/builder.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -3619,6 +3619,20 @@ def make_rotary_embedding_caches(self, **kwargs):
         return super().make_rotary_embedding_caches(cos_cache_name=cos_cache_name, sin_cache_name=sin_cache_name)
 
 
+class ErnieModel(MistralModel):
+    def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
+        super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
+
+        # Ernie uses interleaved rotary position embeddings.
+        self.rotemb_attrs["interleaved"] = 1
+
+        # Ernie uses a `compression_ratio` for its RoPE scaling.
+        # The original RoPE logic in ernie is: position_ids / compression_ratio,
+        # which is equivalent to scaling the frequencies (inv_freq) by 1 / compression_ratio.
+        if hasattr(config, "compression_ratio") and config.compression_ratio != 1.0:
+            self.rotemb_attrs["rescale_factors"] = 1.0 / config.compression_ratio
+
+
 def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
@@ -3739,6 +3753,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
         # Quantized ChatGLM model has ChatGLMForConditionalGeneration as architecture whereas HF model as the latter
         config.hidden_act = "swiglu"
         onnx_model = ChatGLMModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
+    elif config.architectures[0] == "Ernie4_5_ForCausalLM":
+        onnx_model = ErnieModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "GemmaForCausalLM":
         onnx_model = GemmaModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Gemma2ForCausalLM":
```
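To illustrate the `compression_ratio` handling introduced above, here is a small numerical sketch (not part of the commit; the values are arbitrary and the angle computation follows the standard RoPE formulation) showing that dividing position ids by the ratio yields the same rotation angles as rescaling the inverse frequencies:

```py
import numpy as np

# Standard RoPE: angle[pos, i] = position_ids[pos] * inv_freq[i]
head_dim = 64
compression_ratio = 4.0  # arbitrary value for illustration
inv_freq = 1.0 / (10000.0 ** (np.arange(0, head_dim, 2) / head_dim))
position_ids = np.arange(16, dtype=np.float64)

# Ernie-style: scale the positions before computing the angles.
angles_from_scaled_positions = np.outer(position_ids / compression_ratio, inv_freq)

# Builder-style (rescale_factors): keep positions as-is and rescale the frequencies.
angles_from_scaled_freqs = np.outer(position_ids, inv_freq / compression_ratio)

# Both formulations produce identical cos/sin caches.
assert np.allclose(angles_from_scaled_positions, angles_from_scaled_freqs)
```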
