@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",            # gptneox
-            "transformer.wte",              # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",              # gpt2 gpt-j mpt refact qwen dbrx
             "transformer.word_embeddings",  # falcon
             "word_embeddings",              # bloom
             "model.embed_tokens",           # llama-hf
@@ -48,7 +48,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -60,7 +60,7 @@ class TensorNameMap:
             "transformer.ln_f",                        # gpt2 gpt-j falcon
             "model.norm",                              # llama-hf baichuan internlm2
             "norm",                                    # llama-pth
-            "transformer.norm_f",                      # mpt
+            "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",                   # persimmon
@@ -96,6 +96,7 @@ class TensorNameMap:
             "model.layers.{bid}.norm",                          # mamba-qbert
             "backbone.layers.{bid}.norm",                       # mamba
             "transformer.decoder_layer.{bid}.rms_norm",         # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",   # dbrx
         ),

         # Attention norm 2
@@ -108,6 +109,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
             "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                   # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -152,30 +154,32 @@ class TensorNameMap:

         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
-            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact qwen
-            "transformer.blocks.{bid}.attn.out_proj",                    # mpt
-            "transformer.h.{bid}.self_attention.dense",                  # falcon
-            "h.{bid}.self_attention.dense",                              # bloom
-            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf
-            "layers.{bid}.attention.wo",                                 # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",                # bert
-            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
-            "model.layers.{bid}.self_attn.dense",                        # persimmon
-            "h.{bid}.attn.c_proj",                                       # gpt2
-            "transformer.h.{bid}.mixer.out_proj",                        # phi2
-            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
-            "model.layers.{bid}.attention.wo",                           # internlm2
-            "encoder.layers.{bid}.attn.out_proj",                        # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear"  # Grok
+            "gpt_neox.layers.{bid}.attention.dense",                      # gptneox
+            "transformer.h.{bid}.attn.c_proj",                            # gpt2 refact qwen
+            "transformer.blocks.{bid}.attn.out_proj",                     # mpt
+            "transformer.h.{bid}.self_attention.dense",                   # falcon
+            "h.{bid}.self_attention.dense",                               # bloom
+            "model.layers.{bid}.self_attn.o_proj",                        # llama-hf
+            "layers.{bid}.attention.wo",                                  # llama-pth
+            "encoder.layer.{bid}.attention.output.dense",                 # bert
+            "transformer.h.{bid}.attn.out_proj",                          # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense",   # persimmon
+            "model.layers.{bid}.self_attn.dense",                         # persimmon
+            "h.{bid}.attn.c_proj",                                        # gpt2
+            "transformer.h.{bid}.mixer.out_proj",                         # phi2
+            "model.layers.layers.{bid}.self_attn.o_proj",                 # plamo
+            "model.layers.{bid}.attention.wo",                            # internlm2
+            "encoder.layers.{bid}.attn.out_proj",                         # nomic-bert
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

         # Rotary embeddings
@@ -202,9 +206,10 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",            # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
-            "transformer.decoder_layer.{bid}.router"     # Grok
+            "layers.{bid}.feed_forward.gate",             # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
+            "transformer.decoder_layer.{bid}.router",     # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),

         # Feed-forward up
@@ -233,6 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
         ),

         # AWQ-activation gate
@@ -251,8 +257,9 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",       # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear"  # Grok (merged)
+            "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
         ),

         # Feed-forward down
@@ -280,6 +287,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
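
These tables are consumed by the HF-to-GGUF converter, which expands each `{bid}` entry per block and rewrites checkpoint tensor names to the generic GGUF names. Below is a minimal sketch of that lookup for the new dbrx entries, assuming the `gguf-py` package is importable and exposes `MODEL_ARCH.DBRX`, `get_tensor_name_map()` and `TensorNameMap.get_name()` the way `convert-hf-to-gguf.py` uses them; treat those names, and the block count chosen here, as assumptions rather than guarantees.

```python
# Sketch only: resolve a dbrx checkpoint tensor name via TensorNameMap.
# Assumes gguf-py is installed and that MODEL_ARCH.DBRX is registered
# alongside the mappings added in this diff.
import gguf

n_blocks = 40  # hypothetical block count; in practice read it from the model config
tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.DBRX, n_blocks)

# "{bid}" entries are expanded per block, so block 0's fused QKV weight from the
# checkpoint should map to the generic GGUF attention-QKV name.
src = "transformer.blocks.0.norm_attn_norm.attn.Wqkv.weight"
dst = tensor_map.get_name(src, try_suffixes=(".weight", ".bias"))
print(src, "->", dst)  # expected: blk.0.attn_qkv.weight (None if unmapped)
```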