@@ -68,7 +68,7 @@ class TensorNameMap:
             "output_layer",            # chatglm
             "head",                    # rwkv
             "head.out",                # wavtokenizer
-            "language_model.lm_head",  # llama4
+            "lm_head",                 # llama4
         ),

         # Output norm
@@ -91,7 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out",                # rwkv6
             "model.ln_out",               # rwkv7
             "backbone.final_layer_norm",  # wavtokenizer
-            "language_model.model.norm",  # llama4
+            "model.norm",                 # llama4
         ),

         # Rope frequencies
@@ -133,7 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm",                 # openelm
             "rwkv.blocks.{bid}.ln1",                              # rwkv6
             "model.layers.{bid}.ln1",                             # rwkv7
-            "language_model.model.layers.{bid}.input_layernorm",  # llama4
+            "model.layers.{bid}.input_layernorm",                 # llama4
         ),

         # Attention norm 2
@@ -173,7 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq",                     # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj",           # exaone
-            "language_model.model.layers.{bid}.self_attn.q_proj",  # llama4
+            "model.layers.{bid}.self_attn.q_proj",                 # llama4
         ),

         # Attention key
@@ -188,7 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk",                     # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj",           # exaone
-            "language_model.model.layers.{bid}.self_attn.k_proj",  # llama4
+            "model.layers.{bid}.self_attn.k_proj",                 # llama4
         ),

         # Attention value
@@ -202,7 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv",                     # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj",           # exaone
-            "language_model.model.layers.{bid}.self_attn.v_proj",  # llama4
+            "model.layers.{bid}.self_attn.v_proj",                 # llama4
         ),

         # Attention output
@@ -229,7 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense",           # chatglm
             "transformer.layers.{bid}.attn.out_proj",              # openelm
             "transformer.h.{bid}.attn.attention.out_proj",         # exaone
-            "language_model.model.layers.{bid}.self_attn.o_proj",  # llama4
+            "model.layers.{bid}.self_attn.o_proj",                 # llama4
         ),

         # Attention output norm
@@ -268,7 +268,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",                  # Grok
             "encoder.layers.{bid}.post_attention_layernorm",               # chatglm
             "transformer.layers.{bid}.ffn_norm",                           # openelm
-            "language_model.model.layers.{bid}.post_attention_layernorm",  # llama4
+            "model.layers.{bid}.post_attention_layernorm",                 # llama4
         ),

         # Post feed-forward norm
@@ -289,7 +289,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router",                 # Grok
             "transformer.blocks.{bid}.ffn.router.layer",              # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer",       # granitemoe
-            "language_model.model.layers.{bid}.feed_forward.router",  # llama4
+            "model.layers.{bid}.feed_forward.router",                 # llama4
             "encoder.layers.{bid}.mlp.router.layer",                  # nomic-bert-moe
         ),

@@ -329,7 +329,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3",                      # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",                  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",                          # exaone
-            "language_model.model.layers.{bid}.feed_forward.up_proj",  # llama4
+            "model.layers.{bid}.feed_forward.up_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -338,14 +338,14 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",                     # dbrx
             "model.layers.{bid}.mlp.experts.up_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3",                  # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.up_proj",  # llama4
+            "model.layers.{bid}.feed_forward.experts.up_proj",                 # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w1",                         # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.up_proj",                          # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj",                         # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
+            "model.layers.{bid}.mlp.shared_expert.up_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.up_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.up_proj",   # llama4
         ),

         # AWQ-activation gate
@@ -366,22 +366,22 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1",                          # refact
             "model.layers.{bid}.residual_mlp.w1",                        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",                            # exaone
-            "language_model.model.layers.{bid}.feed_forward.gate_proj",  # llama4
+            "model.layers.{bid}.feed_forward.gate_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",                              # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",                        # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",                       # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",                          # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.experts.w1",                    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
+            "layers.{bid}.feed_forward.experts.w1",                # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",          # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",         # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",            # qwen2moe olmoe (merged)
+            "model.layers.{bid}.block_sparse_moe.experts.w1",      # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj",   # llama4
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.gate_proj",                          # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj",                         # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.gate_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.gate_proj",   # llama4
         ),

         # Feed-forward down
@@ -410,7 +410,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer",                        # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h",                    # chatglm
             "model.layers.h.{bid}.mlp.c_proj",                           # exaone
-            "language_model.model.layers.{bid}.feed_forward.down_proj",  # llama4
+            "model.layers.{bid}.feed_forward.down_proj",                 # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -420,15 +420,15 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear",                 # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2",                    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.down_proj",  # llama4
+            "model.layers.{bid}.feed_forward.experts.down_proj",                 # llama4
             "encoder.layers.{bid}.mlp.experts.mlp.w2",                           # nomic-bert-moe
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
-            "model.layers.{bid}.mlp.shared_expert.down_proj",                          # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj",                         # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
-            "model.layers.{bid}.shared_mlp.output_linear",                             # granitemoe
+            "model.layers.{bid}.mlp.shared_expert.down_proj",            # qwen2moe
+            "model.layers.{bid}.mlp.shared_experts.down_proj",           # deepseek deepseek2
+            "model.layers.{bid}.feed_forward.shared_expert.down_proj",   # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",               # granitemoe
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
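
Note on how these entries are consumed, as context for the change above: each string is a source-checkpoint tensor name template, grouped under the GGUF tensor it maps to, and {bid} is substituted with the block index during conversion. With this commit the llama4 templates no longer carry the "language_model." wrapper prefix, presumably because the conversion side now supplies the text-model tensor names already stripped of it. The following is a minimal sketch, not the real TensorNameMap implementation; build_name_map and the two-entry table are hypothetical, and the GGUF targets simply follow the usual "blk.{bid}.*" naming.

# Minimal sketch (assumption): a simplified stand-in for TensorNameMap's block mappings.
BLOCK_MAPPINGS: dict[str, tuple[str, ...]] = {
    # GGUF name template -> source-model name templates (illustrative subset)
    "blk.{bid}.attn_q": (
        "model.layers.{bid}.self_attn.q_proj",                     # llama-hf llama4
    ),
    "blk.{bid}.ffn_up_shexp": (
        "model.layers.{bid}.mlp.shared_expert.up_proj",            # qwen2moe
        "model.layers.{bid}.feed_forward.shared_expert.up_proj",   # llama4
    ),
}

def build_name_map(n_blocks: int) -> dict[str, str]:
    """Expand {bid} for every block and invert to a source-name -> GGUF-name lookup."""
    name_map: dict[str, str] = {}
    for gguf_tmpl, src_tmpls in BLOCK_MAPPINGS.items():
        for bid in range(n_blocks):
            for src in src_tmpls:
                name_map[src.format(bid=bid)] = gguf_tmpl.format(bid=bid)
    return name_map

# With the prefix removed, a llama4 tensor name resolves directly against the table:
names = build_name_map(n_blocks=2)
assert names["model.layers.0.feed_forward.shared_expert.up_proj"] == "blk.0.ffn_up_shexp"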