@@ -30,6 +30,7 @@ class TensorNameMap:
3030 "rwkv.embeddings" , # rwkv6
3131 "model.embeddings" , # rwkv7
3232 "model.word_embeddings" , # bailingmoe
33+ "language_model.model.embed_tokens" , # llama4
3334 ),
3435
3536 # Token type embeddings
@@ -67,6 +68,7 @@ class TensorNameMap:
6768 "output_layer" , # chatglm
6869 "head" , # rwkv
6970 "head.out" , # wavtokenizer
71+ "language_model.lm_head" , # llama4
7072 ),
7173
7274 # Output norm
@@ -89,6 +91,7 @@ class TensorNameMap:
8991 "rwkv.ln_out" , # rwkv6
9092 "model.ln_out" , # rwkv7
9193 "backbone.final_layer_norm" , # wavtokenizer
94+ "language_model.model.norm" , # llama4
9295 ),
9396
9497 # Rope frequencies
@@ -130,6 +133,7 @@ class TensorNameMap:
130133 "transformer.layers.{bid}.attn_norm" , # openelm
131134 "rwkv.blocks.{bid}.ln1" , # rwkv6
132135 "model.layers.{bid}.ln1" , # rwkv7
136+ "language_model.model.layers.{bid}.input_layernorm" , # llama4
133137 ),
134138
135139 # Attention norm 2
@@ -169,6 +173,7 @@ class TensorNameMap:
169173 "model.layers.{bid}.attention.wq" , # internlm2
170174 "transformer.decoder_layer.{bid}.multi_head_attention.query" ,# Grok
171175 "transformer.h.{bid}.attn.attention.q_proj" , # exaone
176+ "language_model.model.layers.{bid}.self_attn.q_proj" , # llama4
172177 ),
173178
174179 # Attention key
@@ -183,6 +188,7 @@ class TensorNameMap:
183188 "model.layers.{bid}.attention.wk" , # internlm2
184189 "transformer.decoder_layer.{bid}.multi_head_attention.key" ,# Grok
185190 "transformer.h.{bid}.attn.attention.k_proj" , # exaone
191+ "language_model.model.layers.{bid}.self_attn.k_proj" , # llama4
186192 ),
187193
188194 # Attention value
@@ -196,6 +202,7 @@ class TensorNameMap:
196202 "model.layers.{bid}.attention.wv" , # internlm2
197203 "transformer.decoder_layer.{bid}.multi_head_attention.value" ,# Grok
198204 "transformer.h.{bid}.attn.attention.v_proj" , # exaone
205+ "language_model.model.layers.{bid}.self_attn.v_proj" , # llama4
199206 ),
200207
201208 # Attention output
@@ -222,6 +229,7 @@ class TensorNameMap:
222229 "encoder.layers.{bid}.self_attention.dense" , # chatglm
223230 "transformer.layers.{bid}.attn.out_proj" , # openelm
224231 "transformer.h.{bid}.attn.attention.out_proj" , # exaone
232+ "language_model.model.layers.{bid}.self_attn.o_proj" , # llama4
225233 ),
226234
227235 # Attention output norm
@@ -259,6 +267,7 @@ class TensorNameMap:
259267 "transformer.decoder_layer.{bid}.rms_norm_2" , # Grok
260268 "encoder.layers.{bid}.post_attention_layernorm" , # chatglm
261269 "transformer.layers.{bid}.ffn_norm" , # openelm
270+ "language_model.model.layers.{bid}.post_attention_layernorm" , # llama4
262271 ),
263272
264273 # Post feed-forward norm
@@ -278,6 +287,7 @@ class TensorNameMap:
278287 "transformer.decoder_layer.{bid}.router" , # Grok
279288 "transformer.blocks.{bid}.ffn.router.layer" , # dbrx
280289 "model.layers.{bid}.block_sparse_moe.router.layer" , # granitemoe
290+ "language_model.model.layers.{bid}.feed_forward.router" , # llama4
281291 ),
282292
283293 MODEL_TENSOR .FFN_GATE_INP_SHEXP : (
@@ -315,6 +325,7 @@ class TensorNameMap:
315325 "model.layers.{bid}.residual_mlp.w3" , # arctic
316326 "encoder.layers.{bid}.mlp.dense_h_to_4h" , # chatglm
317327 "transformer.h.{bid}.mlp.c_fc_1" , # exaone
328+ "language_model.model.layers.{bid}.feed_forward.up_proj" , # llama4
318329 ),
319330
320331 MODEL_TENSOR .FFN_UP_EXP : (
@@ -323,11 +334,13 @@ class TensorNameMap:
323334 "transformer.blocks.{bid}.ffn.experts.mlp.v1" , # dbrx
324335 "model.layers.{bid}.mlp.experts.up_proj" , # qwen2moe olmoe (merged)
325336 "model.layers.{bid}.block_sparse_moe.experts.w3" , # phimoe (merged)
337+ "language_model.model.layers.{bid}.feed_forward.experts.up_proj" , # llama4
326338 ),
327339
328340 MODEL_TENSOR .FFN_UP_SHEXP : (
329341 "model.layers.{bid}.mlp.shared_expert.up_proj" , # qwen2moe
330342 "model.layers.{bid}.mlp.shared_experts.up_proj" , # deepseek deepseek2
343+ "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj" , # llama4
331344 ),
332345
333346 # AWQ-activation gate
@@ -348,6 +361,7 @@ class TensorNameMap:
348361 "transformer.h.{bid}.mlp.linear_1" , # refact
349362 "model.layers.{bid}.residual_mlp.w1" , # arctic
350363 "transformer.h.{bid}.mlp.c_fc_0" , # exaone
364+ "language_model.model.layers.{bid}.feed_forward.gate_proj" , # llama4
351365 ),
352366
353367 MODEL_TENSOR .FFN_GATE_EXP : (
@@ -356,11 +370,13 @@ class TensorNameMap:
356370 "transformer.blocks.{bid}.ffn.experts.mlp.w1" , # dbrx
357371 "model.layers.{bid}.mlp.experts.gate_proj" , # qwen2moe olmoe (merged)
358372 "model.layers.{bid}.block_sparse_moe.experts.w1" , # phimoe (merged)
373+ "language_model.model.layers.{bid}.feed_forward.experts.gate_proj" , # llama4
359374 ),
360375
361376 MODEL_TENSOR .FFN_GATE_SHEXP : (
362377 "model.layers.{bid}.mlp.shared_expert.gate_proj" , # qwen2moe
363378 "model.layers.{bid}.mlp.shared_experts.gate_proj" , # deepseek deepseek2
379+ "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj" , # llama4
364380 ),
365381
366382 # Feed-forward down
@@ -389,6 +405,7 @@ class TensorNameMap:
389405 "encoder.layer.{bid}.mlp.down_layer" , # jina-bert-v2
390406 "encoder.layers.{bid}.mlp.dense_4h_to_h" , # chatglm
391407 "model.layers.h.{bid}.mlp.c_proj" , # exaone
408+ "language_model.model.layers.{bid}.feed_forward.down_proj" , # llama4
392409 ),
393410
394411 MODEL_TENSOR .FFN_DOWN_EXP : (
@@ -398,11 +415,13 @@ class TensorNameMap:
398415 "model.layers.{bid}.mlp.experts.down_proj" , # qwen2moe olmoe (merged)
399416 "model.layers.{bid}.block_sparse_moe.output_linear" , # granitemoe
400417 "model.layers.{bid}.block_sparse_moe.experts.w2" , # phimoe (merged)
418+ "language_model.model.layers.{bid}.feed_forward.experts.down_proj" , # llama4
401419 ),
402420
403421 MODEL_TENSOR .FFN_DOWN_SHEXP : (
404422 "model.layers.{bid}.mlp.shared_expert.down_proj" , # qwen2moe
405423 "model.layers.{bid}.mlp.shared_experts.down_proj" , # deepseek deepseek2
424+ "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj" , # llama4
406425 ),
407426
408427 MODEL_TENSOR .ATTN_Q_NORM : (
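For context, a minimal sketch (not part of the diff) of how these new llama4 entries are consumed through gguf-py's TensorNameMap during HF-to-GGUF conversion. It assumes a MODEL_ARCH.LLAMA4 value is registered in gguf.constants alongside this mapping change; the block count of 48 is an arbitrary example.

```python
# Sketch only: resolve an HF checkpoint tensor name to its GGUF name.
# Assumes MODEL_ARCH.LLAMA4 exists in gguf.constants (added with llama4 support).
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# Build the HF-name -> GGUF-name map for an example block count of 48.
tmap = get_tensor_name_map(MODEL_ARCH.LLAMA4, 48)

# The per-layer entry added in this diff resolves to the canonical GGUF tensor name.
name = tmap.get_name("language_model.model.layers.0.self_attn.q_proj.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # expected: "blk.0.attn_q.weight"
```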