@@ -3640,6 +3640,35 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         self.rotemb_attrs["rescale_factors"] = 1.0 / config.compression_ratio
 
 
+class SmolLM3Model(LlamaModel):
+    def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
+        super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
+        self.layer_types = config.layer_types
+        self.no_rope_layers = config.no_rope_layers
+
+    def make_attention(self, layer_id, attention, root_input, **kwargs):
+        # SmolLM3 uses per-layer conditional RoPE and sliding window attention.
+        # So, we temporarily modify the model's attributes before calling the
+        # base `make_attention` method, then restore them immediately after.
+        original_use_rope = self.attention_attrs["use_rope_in_attn"]
+        original_window_size = self.window_size
+
+        # Enable/disable RoPE for the current layer.
+        self.attention_attrs["use_rope_in_attn"] = bool(self.no_rope_layers[layer_id])
+
+        # Set the sliding window size for the current layer.
+        assert self.layer_types[layer_id] in {"sliding_attention", "full_attention"}
+        if self.layer_types[layer_id] == "full_attention":
+            self.window_size = -1
+
+        # Call the original `make_attention` with the temporarily-modified settings.
+        super().make_attention(layer_id, attention, root_input, **kwargs)
+
+        # Restore original values.
+        self.attention_attrs["use_rope_in_attn"] = original_use_rope
+        self.window_size = original_window_size
+
+
 def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
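For context, here is a minimal sketch (not part of the diff) of how the two per-layer config fields drive the toggling above. In SmolLM3's Hugging Face config, `layer_types` names each layer's attention variant and `no_rope_layers` holds one entry per layer, where a nonzero value means RoPE is applied in that layer (matching the `bool(...)` cast above); the concrete values below are made up for illustration.

```python
# Hypothetical per-layer flags, shaped like the fields a SmolLM3 config provides.
layer_types = ["sliding_attention", "full_attention", "sliding_attention"]
no_rope_layers = [1, 1, 0]  # nonzero -> apply RoPE in that layer (assumed semantics)

DEFAULT_WINDOW_SIZE = 4096  # illustrative sliding-window width

for layer_id, layer_type in enumerate(layer_types):
    use_rope = bool(no_rope_layers[layer_id])
    # -1 disables the sliding window, mirroring the `full_attention` branch above.
    window_size = -1 if layer_type == "full_attention" else DEFAULT_WINDOW_SIZE
    print(f"layer {layer_id}: use_rope={use_rope}, window_size={window_size}")
```

The save/modify/restore pattern in `make_attention` is needed because `attention_attrs["use_rope_in_attn"]` and `window_size` are model-wide attributes shared across layers, so each per-layer override must be undone before the next layer is built.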
@@ -3828,6 +3857,8 @@ def create_model(model_name, input_path, output_dir, precision, execution_provider
         onnx_model = QwenModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     elif config.architectures[0] == "Qwen3ForCausalLM":
         onnx_model = Qwen3Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
+    elif config.architectures[0] == "SmolLM3ForCausalLM":
+        onnx_model = SmolLM3Model(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
     else:
         raise NotImplementedError(f"The {hf_name} model is not currently supported.")
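With the new branch in place, SmolLM3 checkpoints should route through the builder's usual entry point. Below is a hedged usage sketch, assuming the `create_model` parameter names visible in the hunk header plus a trailing `cache_dir` argument, and the `HuggingFaceTB/SmolLM3-3B` checkpoint id; adjust paths, precision, and execution provider to your setup.

```python
# Hedged usage sketch: exporting a SmolLM3 checkpoint via create_model,
# which dispatches on config.architectures[0] == "SmolLM3ForCausalLM".
from onnxruntime_genai.models.builder import create_model

create_model(
    model_name="HuggingFaceTB/SmolLM3-3B",  # assumed Hugging Face checkpoint id
    input_path="",                          # empty: fetch weights via model_name (assumed behavior)
    output_dir="./smollm3-onnx",
    precision="fp16",
    execution_provider="cuda",
    cache_dir="./cache",                    # assumed parameter; not visible in the truncated header
)
```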