@@ -13,7 +13,7 @@ class TensorNameMap:
1313 "transformer.wte" , # gpt2 gpt-j mpt refact qwen dbrx jais exaone
1414 "transformer.word_embeddings" , # falcon
1515 "word_embeddings" , # bloom
16- "model.embed_tokens" , # llama-hf nemotron
16+ "model.embed_tokens" , # llama-hf nemotron olmoe
1717 "tok_embeddings" , # llama-pth
1818 "embeddings.word_embeddings" , # bert nomic-bert
1919 "language_model.embedding.word_embeddings" , # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",  # gptneox
             "transformer.ln_f",           # gpt2 gpt-j falcon jais exaone
-            "model.norm",                 # llama-hf baichuan internlm2
+            "model.norm",                 # llama-hf baichuan internlm2 olmoe
             "norm",                       # llama-pth
             "transformer.norm_f",         # mpt dbrx
             "ln_f",                       # refact bloom qwen gpt2
@@ -98,7 +98,7 @@ class TensorNameMap:
9898 "transformer.h.{bid}.input_layernorm" , # falcon7b
9999 "h.{bid}.input_layernorm" , # bloom
100100 "transformer.h.{bid}.ln_mlp" , # falcon40b
101- "model.layers.{bid}.input_layernorm" , # llama-hf nemotron
101+ "model.layers.{bid}.input_layernorm" , # llama-hf nemotron olmoe
102102 "layers.{bid}.attention_norm" , # llama-pth
103103 "language_model.encoder.layers.{bid}.input_layernorm" , # persimmon
104104 "model.layers.{bid}.ln1" , # yi
@@ -142,7 +142,7 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.q_proj",       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wq",                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",  # bert
             "transformer.h.{bid}.attn.q_proj",           # gpt-j
@@ -154,7 +154,7 @@ class TensorNameMap:

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",     # llama-hf nemotron
+            "model.layers.{bid}.self_attn.k_proj",     # llama-hf nemotron olmoe
             "layers.{bid}.attention.wk",               # llama-pth
             "encoder.layer.{bid}.attention.self.key",  # bert
             "transformer.h.{bid}.attn.k_proj",         # gpt-j
@@ -167,7 +167,7 @@ class TensorNameMap:

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.v_proj",       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wv",                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
             "transformer.h.{bid}.attn.v_proj",           # gpt-j
@@ -185,7 +185,7 @@ class TensorNameMap:
185185 "transformer.blocks.{bid}.attn.out_proj" , # mpt
186186 "transformer.h.{bid}.self_attention.dense" , # falcon
187187 "h.{bid}.self_attention.dense" , # bloom
188- "model.layers.{bid}.self_attn.o_proj" , # llama-hf nemotron
188+ "model.layers.{bid}.self_attn.o_proj" , # llama-hf nemotron olmoe
189189 "layers.{bid}.attention.wo" , # llama-pth
190190 "encoder.layer.{bid}.attention.output.dense" , # bert
191191 "transformer.h.{bid}.attn.out_proj" , # gpt-j
@@ -229,7 +229,7 @@ class TensorNameMap:
229229 "transformer.h.{bid}.ln_2" , # gpt2 refact qwen jais exaone
230230 "h.{bid}.post_attention_layernorm" , # bloom
231231 "transformer.blocks.{bid}.norm_2" , # mpt
232- "model.layers.{bid}.post_attention_layernorm" , # llama-hf nemotron
232+ "model.layers.{bid}.post_attention_layernorm" , # llama-hf nemotron olmoe
233233 "layers.{bid}.ffn_norm" , # llama-pth
234234 "language_model.encoder.layers.{bid}.post_attention_layernorm" , # persimmon
235235 "model.layers.{bid}.ln2" , # yi
@@ -253,7 +253,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe
+            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
             "transformer.decoder_layer.{bid}.router",     # Grok
             "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),
@@ -295,7 +295,7 @@ class TensorNameMap:
295295 "layers.{bid}.feed_forward.experts.w3" , # mixtral (merged)
296296 "transformer.decoder_layer.{bid}.moe.linear_v" , # Grok (merged)
297297 "transformer.blocks.{bid}.ffn.experts.mlp.v1" , # dbrx
298- "model.layers.{bid}.mlp.experts.up_proj" , # qwen2moe (merged)
298+ "model.layers.{bid}.mlp.experts.up_proj" , # qwen2moe olmoe (merged)
299299 ),
300300
301301 MODEL_TENSOR .FFN_UP_SHEXP : (
@@ -327,7 +327,7 @@ class TensorNameMap:
327327 "layers.{bid}.feed_forward.experts.w1" , # mixtral (merged)
328328 "transformer.decoder_layer.{bid}.moe.linear" , # Grok (merged)
329329 "transformer.blocks.{bid}.ffn.experts.mlp.w1" , # dbrx
330- "model.layers.{bid}.mlp.experts.gate_proj" , # qwen2moe (merged)
330+ "model.layers.{bid}.mlp.experts.gate_proj" , # qwen2moe olmoe (merged)
331331 ),
332332
333333 MODEL_TENSOR .FFN_GATE_SHEXP : (
@@ -367,7 +367,7 @@ class TensorNameMap:
367367 "layers.{bid}.feed_forward.experts.w2" , # mixtral (merged)
368368 "transformer.decoder_layer.{bid}.moe.linear_1" , # Grok (merged)
369369 "transformer.blocks.{bid}.ffn.experts.mlp.w2" , # dbrx
370- "model.layers.{bid}.mlp.experts.down_proj" , # qwen2moe (merged)
370+ "model.layers.{bid}.mlp.experts.down_proj" , # qwen2moe olmoe (merged)
371371 ),
372372
373373 MODEL_TENSOR .FFN_DOWN_SHEXP : (
@@ -378,7 +378,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",         # persimmon
-            "model.layers.{bid}.self_attn.q_norm",              # cohere
+            "model.layers.{bid}.self_attn.q_norm",              # cohere olmoe
             "transformer.blocks.{bid}.attn.q_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",  # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",             # openelm
@@ -387,7 +387,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",         # persimmon
-            "model.layers.{bid}.self_attn.k_norm",              # cohere
+            "model.layers.{bid}.self_attn.k_norm",              # cohere olmoe
             "transformer.blocks.{bid}.attn.k_ln",               # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",  # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",             # openelm