@@ -169,6 +169,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wq",  # llama-pth
             "encoder.layer.{bid}.attention.self.query",  # bert
+            "transformer.layer.{bid}.attention.q_lin",  # distilbert
             "transformer.h.{bid}.attn.q_proj",  # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
             "model.layers.{bid}.attention.wq",  # internlm2
@@ -183,6 +184,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_proj_no_perm",  # llama-custom
             "layers.{bid}.attention.wk",  # llama-pth
             "encoder.layer.{bid}.attention.self.key",  # bert
+            "transformer.layer.{bid}.attention.k_lin",  # distilbert
             "transformer.h.{bid}.attn.k_proj",  # gpt-j
             "transformer.h.{bid}.attn.k",  # refact
             "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
@@ -197,6 +199,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.v_proj",  # llama-hf nemotron olmoe olmo2 phimoe
             "layers.{bid}.attention.wv",  # llama-pth
             "encoder.layer.{bid}.attention.self.value",  # bert
+            "transformer.layer.{bid}.attention.v_lin",  # distilbert
             "transformer.h.{bid}.attn.v_proj",  # gpt-j
             "transformer.h.{bid}.attn.v",  # refact
             "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
@@ -217,6 +220,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.linear_attn",  # deci
             "layers.{bid}.attention.wo",  # llama-pth
             "encoder.layer.{bid}.attention.output.dense",  # bert
+            "transformer.layer.{bid}.attention.out_lin",  # distilbert
             "transformer.h.{bid}.attn.out_proj",  # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
             "model.layers.{bid}.self_attn.dense",  # persimmon
@@ -237,6 +241,7 @@ class TensorNameMap:
         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
+            "transformer.layer.{bid}.sa_layer_norm",  # distilbert
             "encoder.layers.{bid}.norm1",  # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
@@ -313,6 +318,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",  # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",  # llama-pth
             "encoder.layer.{bid}.intermediate.dense",  # bert
+            "transformer.layer.{bid}.ffn.lin1",  # distilbert
             "transformer.h.{bid}.mlp.fc_in",  # gpt-j
             "transformer.h.{bid}.mlp.linear_3",  # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
@@ -396,6 +402,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",  # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",  # llama-pth
             "encoder.layer.{bid}.output.dense",  # bert
+            "transformer.layer.{bid}.ffn.lin2",  # distilbert
             "transformer.h.{bid}.mlp.fc_out",  # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
             "model.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
@@ -457,6 +464,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",  # bert
+            "transformer.layer.{bid}.output_layer_norm",  # distilbert
             "encoder.layers.{bid}.norm2",  # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",  # Grok
             "encoder.layer.{bid}.mlp.layernorm",  # jina-bert-v2
@@ -827,6 +835,7 @@ class TensorNameMap:
         MODEL_TENSOR.CLS: (
            "classifier",  # jina
            "classifier.dense",  # roberta
+           "pre_classifier",  # distilbert
         ),
 
         MODEL_TENSOR.CLS_OUT: (
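The same pattern repeats in every hunk: the DistilBERT checkpoint name is added alongside the existing BERT entry for the same canonical tensor. For context, here is a minimal standalone sketch of the reverse lookup these `{bid}` templates feed into. `BLOCK_MAPPINGS`, `build_lookup`, and the canonical keys below are illustrative stand-ins rather than the actual gguf-py API, though `TensorNameMap` performs the same per-block expansion during conversion:

```python
# Sketch of how {bid}-templated entries resolve checkpoint tensor names to
# canonical ones. BLOCK_MAPPINGS, build_lookup, and the canonical keys are
# illustrative stand-ins, not the gguf-py API.

# Canonical tensor -> per-architecture checkpoint names (subset from the diff above)
BLOCK_MAPPINGS: dict[str, tuple[str, ...]] = {
    "attn_q": (
        "encoder.layer.{bid}.attention.self.query",  # bert
        "transformer.layer.{bid}.attention.q_lin",   # distilbert
    ),
    "attn_out_norm": (
        "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
        "transformer.layer.{bid}.sa_layer_norm",           # distilbert
    ),
}


def build_lookup(n_blocks: int) -> dict[str, str]:
    """Expand each template for every block index into a flat reverse map."""
    lookup: dict[str, str] = {}
    for canonical, templates in BLOCK_MAPPINGS.items():
        for template in templates:
            for bid in range(n_blocks):
                lookup[template.format(bid=bid)] = canonical
    return lookup


lookup = build_lookup(n_blocks=6)  # DistilBERT-base has 6 layers
print(lookup["transformer.layer.5.attention.q_lin"])  # attn_q
print(lookup["transformer.layer.0.sa_layer_norm"])    # attn_out_norm
```

Because every architecture's spelling collapses onto one canonical key, supporting DistilBERT needs only these tuple entries, with no model-specific lookup logic.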