
Commit 5c15fe2

Authored by xenova, kunal-vaishnavi, and LorenRd
Add support for smollm3 (microsoft#1666)
cc @guschmue

Co-authored-by: kunal-vaishnavi <[email protected]>
Co-authored-by: Lorenzo Rondán <[email protected]>
1 parent 18431d8 commit 5c15fe2

4 files changed: +34 -2 lines changed


README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ See documentation at https://onnxruntime.ai/docs/genai.
 
 |Support matrix|Supported now|Under development|On the roadmap|
 | -------------- | ------------- | ----------------- | -------------- |
-| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Gemma <br/> Granite <br/> Llama * <br/> Mistral + <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen | Whisper | Stable diffusion |
+| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Gemma <br/> Granite <br/> Llama * <br/> Mistral + <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen <br/> SmolLM3 | Whisper | Stable diffusion |
 |API| Python <br/>C# <br/>C/C++ <br/> Java ^ |Objective-C||
 |Platform| Linux <br/> Windows <br/>Mac ^ <br/>Android ^ ||iOS |
 |Architecture|x86 <br/> x64 <br/> Arm64 ~ |||
```
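For context, the Python entry in the API row above is enough to run an exported SmolLM3 model end to end. A minimal generation loop as a sketch, assuming a recent onnxruntime-genai release and a model exported to `./smollm3-onnx` (the path and prompt are illustrative, and a real run would apply the model's chat template first):

```python
# Minimal sketch: token-by-token generation with the onnxruntime-genai Python API.
# The model folder "./smollm3-onnx" is hypothetical.
import onnxruntime_genai as og

model = og.Model("./smollm3-onnx")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=128)

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("What is ONNX Runtime?"))

# Decode one token at a time until the search finishes.
while not generator.is_done():
    generator.generate_next_token()

print(tokenizer.decode(generator.get_sequence(0)))
```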

src/models/model_type.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -12,7 +12,7 @@ namespace Generators {
 struct ModelType {
   inline static bool IsLLM(const std::string& model_type) {
     // Large-language model (LLM)
-    static constexpr std::array<std::string_view, 18> LLM = {"chatglm", "decoder", "ernie4_5", "gemma", "gemma2", "gemma3_text", "gpt2", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3"};
+    static constexpr std::array<std::string_view, 19> LLM = {"chatglm", "decoder", "ernie4_5", "gemma", "gemma2", "gemma3_text", "gpt2", "granite", "llama", "mistral", "nemotron", "olmo", "phi", "phimoe", "phi3", "phi3small", "qwen2", "qwen3", "smollm3"};
     return std::find(LLM.begin(), LLM.end(), model_type) != LLM.end();
   }
 
```

src/python/py/models/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -42,6 +42,7 @@ The tool currently supports the following model architectures.
 - Nemotron
 - Phi
 - Qwen
+- SmolLM3
 
 It is intended for supporting the latest, popular state-of-the-art models.
 
```
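With SmolLM3 on this list, the builder described in this README can export it directly. A sketch of the invocation, assuming the Hugging Face id `HuggingFaceTB/SmolLM3-3B` and an int4 CPU target (the model id, paths, and options are illustrative):

```bash
# Hypothetical builder run; the flags follow the usage documented in this README.
python3 builder.py \
  -m HuggingFaceTB/SmolLM3-3B \
  -o ./smollm3-onnx \
  -p int4 \
  -e cpu \
  -c ./hf_cache
```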

src/python/py/models/builder.py

Lines changed: 31 additions & 0 deletions
```diff
@@ -3640,6 +3640,35 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         self.rotemb_attrs["rescale_factors"] = 1.0 / config.compression_ratio
 
 
+class SmolLM3Model(LlamaModel):
+    def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
+        super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
+        self.layer_types = config.layer_types
+        self.no_rope_layers = config.no_rope_layers
+
+    def make_attention(self, layer_id, attention, root_input, **kwargs):
+        # SmolLM3 uses per-layer conditional RoPE and Sliding Window Attention.
+        # So, we temporarily modify the model's attributes before calling the
+        # base `make_attention` method, then restore them immediately after.
+        original_use_rope = self.attention_attrs["use_rope_in_attn"]
+        original_window_size = self.window_size
+
+        # Enable/disable RoPE for the current layer.
+        self.attention_attrs["use_rope_in_attn"] = bool(self.no_rope_layers[layer_id])
+
+        # Set the sliding window size for the current layer.
+        assert self.layer_types[layer_id] in {"sliding_attention", "full_attention"}
+        if self.layer_types[layer_id] == "full_attention":
+            self.window_size = -1
+
+        # Call the original `make_attention` with the temporarily modified settings.
+        super().make_attention(layer_id, attention, root_input, **kwargs)
+
+        # Restore original values.
+        self.attention_attrs["use_rope_in_attn"] = original_use_rope
+        self.window_size = original_window_size
+
+
 def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
@@ -3828,6 +3857,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
         onnx_model = QwenModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Qwen3ForCausalLM":
         onnx_model = Qwen3Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
+    elif config.architectures[0] == "SmolLM3ForCausalLM":
+        onnx_model = SmolLM3Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     else:
         raise NotImplementedError(f"The {hf_name} model is not currently supported.")
 
```
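The `make_attention` override above is driven by two per-layer lists read from the Hugging Face config: `layer_types` chooses between full and sliding-window attention, and `no_rope_layers` stores 1 for layers that apply RoPE and 0 for NoPE layers. A standalone sketch of that mapping, with made-up values (the real lists come from the checkpoint's `config.json`):

```python
# Illustrative per-layer settings in the style of a SmolLM3 config.json;
# the lists and the window size below are made up for demonstration.
layer_types = ["full_attention", "full_attention", "full_attention", "sliding_attention"]
no_rope_layers = [1, 1, 1, 0]  # 1 = apply RoPE in this layer, 0 = NoPE layer

for layer_id, layer_type in enumerate(layer_types):
    use_rope = bool(no_rope_layers[layer_id])
    # Full-attention layers disable the sliding window (-1 means unlimited).
    window_size = -1 if layer_type == "full_attention" else 4096
    print(f"layer {layer_id}: use_rope={use_rope}, window_size={window_size}")
```

Saving and restoring the original attributes matters because the base class reads `attention_attrs` and `window_size` as shared state; without the restore, one layer's temporary settings would leak into every later layer.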
