@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
-            "model.embed_tokens",                        # llama-hf nemotron
+            "model.embed_tokens",                        # llama-hf nemotron olmoe
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",               # gptneox
             "transformer.ln_f",                        # gpt2 gpt-j falcon jais exaone
-            "model.norm",                              # llama-hf baichuan internlm2
+            "model.norm",                              # llama-hf baichuan internlm2 olmoe
             "norm",                                    # llama-pth
             "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
@@ -98,7 +98,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm",                  # falcon7b
             "h.{bid}.input_layernorm",                              # bloom
             "transformer.h.{bid}.ln_mlp",                           # falcon40b
-            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron
+            "model.layers.{bid}.input_layernorm",                   # llama-hf nemotron olmoe
             "layers.{bid}.attention_norm",                          # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
             "model.layers.{bid}.ln1",                               # yi
@@ -142,7 +142,7 @@ class TensorNameMap:
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wq",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.query",                  # bert
             "transformer.h.{bid}.attn.q_proj",                           # gpt-j
@@ -154,7 +154,7 @@ class TensorNameMap:
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf nemotron olmoe
             "layers.{bid}.attention.wk",                               # llama-pth
             "encoder.layer.{bid}.attention.self.key",                  # bert
             "transformer.h.{bid}.attn.k_proj",                         # gpt-j
@@ -167,7 +167,7 @@ class TensorNameMap:
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf nemotron olmoe
             "layers.{bid}.attention.wv",                                 # llama-pth
             "encoder.layer.{bid}.attention.self.value",                  # bert
             "transformer.h.{bid}.attn.v_proj",                           # gpt-j
@@ -185,7 +185,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj",                       # mpt
             "transformer.h.{bid}.self_attention.dense",                     # falcon
             "h.{bid}.self_attention.dense",                                 # bloom
-            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron
+            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf nemotron olmoe
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
@@ -229,7 +229,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2",                                       # gpt2 refact qwen jais exaone
             "h.{bid}.post_attention_layernorm",                               # bloom
             "transformer.blocks.{bid}.norm_2",                                # mpt
-            "model.layers.{bid}.post_attention_layernorm",                    # llama-hf nemotron
+            "model.layers.{bid}.post_attention_layernorm",                    # llama-hf nemotron olmoe
             "layers.{bid}.ffn_norm",                                          # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
             "model.layers.{bid}.ln2",                                         # yi
@@ -253,7 +253,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
-            "model.layers.{bid}.mlp.gate",                # qwen2moe
+            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
             "transformer.decoder_layer.{bid}.router",     # Grok
             "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),
@@ -295,7 +295,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -327,7 +327,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -367,7 +367,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe (merged)
+            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -378,7 +378,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.q_norm",                            # cohere
+            "model.layers.{bid}.self_attn.q_norm",                            # cohere olmoe
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q",                # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm",                           # openelm
@@ -387,7 +387,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
-            "model.layers.{bid}.self_attn.k_norm",                            # cohere
+            "model.layers.{bid}.self_attn.k_norm",                            # cohere olmoe
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k",                # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm",                           # openelm
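
For context on how these mapping tuples get used during conversion: each checkpoint tensor name is matched against the patterns above, with {bid} substituted by the block index, to find its GGUF tensor kind, which is why adding olmoe only requires extending the comments on the patterns it already shares. The snippet below is a minimal, self-contained sketch of that lookup, not the actual gguf-py TensorNameMap API; EXAMPLE_MAPPINGS and resolve_tensor are illustrative names, and only a few of the olmoe-related patterns from this diff are included.

from __future__ import annotations

# Illustrative subset of the mapping: GGUF tensor kind -> HF name patterns,
# where {bid} is the transformer block index. The real table covers every
# MODEL_TENSOR entry and all supported architectures.
EXAMPLE_MAPPINGS: dict[str, tuple[str, ...]] = {
    "ATTN_Q_NORM":  ("model.layers.{bid}.self_attn.q_norm",),  # cohere olmoe
    "ATTN_K_NORM":  ("model.layers.{bid}.self_attn.k_norm",),  # cohere olmoe
    "FFN_GATE_INP": ("model.layers.{bid}.mlp.gate",),          # qwen2moe olmoe
}

def resolve_tensor(hf_name: str, n_blocks: int) -> str | None:
    """Map an HF checkpoint tensor name (without the '.weight' suffix)
    to its GGUF tensor kind, or return None if no pattern matches."""
    for kind, patterns in EXAMPLE_MAPPINGS.items():
        for pattern in patterns:
            for bid in range(n_blocks):
                if hf_name == pattern.format(bid=bid):
                    return kind
    return None

if __name__ == "__main__":
    # An OLMoE q_norm tensor in block 3 resolves via the pattern added above.
    print(resolve_tensor("model.layers.3.self_attn.q_norm", n_blocks=16))  # ATTN_Q_NORM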