@@ -32,6 +32,7 @@ class TensorNameMap:
             "model.word_embeddings",  # bailingmoe
             "language_model.model.embed_tokens",  # llama4
             "encoder",  # neobert
+            "model.text_model.embed_tokens.weight",  # smoldocling
         ),
 
         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",  # gptneox
             "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
-            "output",  # llama-pth bloom internlm2
+            "output",  # llama-pth bloom internlm2 smoldocling
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",  # phi2
             "output_layer",  # chatglm
@@ -93,6 +94,7 @@ class TensorNameMap:
             "model.ln_out",  # rwkv7
             "backbone.final_layer_norm",  # wavtokenizer
             "model.norm",  # llama4
+            "output_norm",  # smoldocling
         ),
 
         # Rope frequencies
@@ -136,6 +138,7 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",  # rwkv7
             "model.layers.{bid}.input_layernorm",  # llama4
             "transformer_encoder.{bid}.attention_norm",  # neobert
+            "blk.{bid}.attn_norm",  # smoldocling
         ),
 
         # Attention norm 2
@@ -179,6 +182,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
             "transformer.h.{bid}.attn.attention.q_proj",  # exaone
             "model.layers.{bid}.self_attn.q_proj",  # llama4
+            "blk.{bid}.attn_q",  # smoldocling
         ),
 
         # Attention key
@@ -195,6 +199,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
             "transformer.h.{bid}.attn.attention.k_proj",  # exaone
             "model.layers.{bid}.self_attn.k_proj",  # llama4
+            "blk.{bid}.attn_k",  # smoldocling
         ),
 
         # Attention value
@@ -210,6 +215,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
             "transformer.h.{bid}.attn.attention.v_proj",  # exaone
             "model.layers.{bid}.self_attn.v_proj",  # llama4
+            "blk.{bid}.attn_v",  # smoldocling
+
         ),
 
         # Attention output
@@ -240,6 +247,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj",  # exaone
             "model.layers.{bid}.self_attn.o_proj",  # llama4
             "transformer_encoder.{bid}.wo",  # neobert
+            "blk.{bid}.attn_output",  # smoldocling
         ),
 
         # Attention output norm
@@ -249,6 +257,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",  # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
+            "blk.{bid}.attn_norm",  # smoldocling
         ),
 
         MODEL_TENSOR.ATTN_POST_NORM: (
@@ -281,6 +290,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn_norm",  # openelm
             "model.layers.{bid}.post_attention_layernorm",  # llama4
             "transformer_encoder.{bid}.ffn_norm",  # neobert
+            "blk.{bid}.ffn_norm",  # smoldocling
         ),
 
         # Post feed-forward norm
@@ -346,6 +356,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",  # exaone
             "model.layers.{bid}.feed_forward.up_proj",  # llama4
             "transformer_encoder.{bid}.ffn.w12",  # neobert
+            "blk.{bid}.ffn_up",  # smoldocling
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -383,6 +394,8 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",  # arctic
             "transformer.h.{bid}.mlp.c_fc_0",  # exaone
             "model.layers.{bid}.feed_forward.gate_proj",  # llama4
+            "blk.{bid}.ffn_gate",  # smoldocling
+
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -429,6 +442,8 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",  # exaone
             "model.layers.{bid}.feed_forward.down_proj",  # llama4
             "transformer_encoder.{bid}.ffn.w3",  # neobert
+            "blk.{bid}.ffn_down",  # smoldocling
+
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
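For context: each MODEL_TENSOR entry in this file pairs a GGUF tensor with the source-model tensor names that may appear in a checkpoint, with "{bid}" filled in per transformer block; the smoldocling additions register names that already follow the GGUF "blk.{bid}.*" convention alongside the usual HF-style ones. The sketch below is a minimal, self-contained illustration of that lookup, not the actual TensorNameMap implementation from gguf-py; GGUF_TEMPLATES and build_reverse_map are hypothetical names introduced only for this example.

# Minimal sketch (illustrative, not the real gguf-py API): expand per-block
# templates and reverse-map a checkpoint tensor name to its GGUF tensor name.
GGUF_TEMPLATES: dict[str, tuple[str, ...]] = {
    # GGUF tensor name template -> accepted source-model name templates
    "blk.{bid}.attn_q": ("model.layers.{bid}.self_attn.q_proj", "blk.{bid}.attn_q"),
    "blk.{bid}.attn_k": ("model.layers.{bid}.self_attn.k_proj", "blk.{bid}.attn_k"),
    # the smoldocling embedding name in the diff carries an explicit ".weight" suffix
    "token_embd":       ("model.embed_tokens", "model.text_model.embed_tokens.weight"),
}

def build_reverse_map(n_blocks: int) -> dict[str, str]:
    """Expand "{bid}" for every block and map each source name to its GGUF name."""
    reverse: dict[str, str] = {}
    for gguf_tmpl, src_tmpls in GGUF_TEMPLATES.items():
        if "{bid}" in gguf_tmpl:
            for bid in range(n_blocks):
                for src in src_tmpls:
                    reverse[src.format(bid=bid)] = gguf_tmpl.format(bid=bid)
        else:
            for src in src_tmpls:
                reverse[src] = gguf_tmpl
    return reverse

if __name__ == "__main__":
    rmap = build_reverse_map(n_blocks=2)
    # A GGUF-style name (as added for smoldocling) and an HF-style name
    # resolve to the same kind of target tensor.
    print(rmap["blk.0.attn_q"])                     # -> blk.0.attn_q
    print(rmap["model.layers.1.self_attn.k_proj"])  # -> blk.1.attn_k

In the converter such a map is typically built once per architecture and block count, then queried for every checkpoint tensor; the entries added in this diff simply make smoldocling's tensor names resolve through the same table as the existing HF-style names.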