@@ -1119,7 +1119,8 @@ class TensorNameMap:
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
-            "vision_tower.patch_conv", # pixtral
+            "vision_tower.patch_conv", # pixtral-hf
+            "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
             "visual.patch_embed.proj", # qwen2vl
         ),
@@ -1138,7 +1139,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
 
@@ -1153,7 +1155,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
 
@@ -1168,7 +1171,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl
         ),
 
@@ -1178,7 +1182,8 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
-            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
             "vision_model.model.layers.{bid}.input_layernorm", # llama4
             "visual.blocks.{bid}.norm1", # qwen2vl
         ),
@@ -1190,7 +1195,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
             "visual.blocks.{bid}.attn.proj", # qwen2vl
         ),
 
@@ -1201,7 +1207,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
-            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
             "visual.blocks.{bid}.norm2", # qwen2vl
         ),
 
@@ -1210,14 +1217,16 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
-            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
-            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
         ),
 
@@ -1226,7 +1235,8 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
-            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
@@ -1244,7 +1254,8 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
-            "vision_tower.ln_pre", # pixtral
+            "vision_tower.ln_pre", # pixtral-hf
+            "vision_encoder.ln_pre", # pixtral
             "vision_model.layernorm_pre", # llama4
         ),
 
@@ -1261,6 +1272,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
+            "pre_mm_projector_norm",
         ),
 
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1316,7 +1328,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_MM_PATCH_MERGER: (
-            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
+            "patch_merger.merging_layer", # mistral
         ),
 
         # audio (mtmd)
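
For context: the new `vision_encoder.*` entries match Mistral's reference (non-HF) pixtral checkpoint layout, where the attention projections are named `wq`/`wk`/`wv`/`wo` and the SwiGLU feed-forward uses `w1` (gate), `w3` (up), and `w2` (down), while the existing `vision_tower.*` entries cover the Hugging Face export. A minimal, self-contained sketch of the alias-lookup pattern this table enables (hypothetical canonical name and helper, not the actual gguf-py API):

```python
# Both the HF-style and Mistral-style source names for a layer resolve to
# the same canonical tensor name, so either checkpoint format converts.
V_ENC_ATTN_Q = "v.blk.{bid}.attn_q"  # hypothetical canonical GGUF name

ALIASES: dict[str, str] = {
    "vision_tower.transformer.layers.{bid}.attention.q_proj": V_ENC_ATTN_Q,  # pixtral-hf
    "vision_encoder.transformer.layers.{bid}.attention.wq": V_ENC_ATTN_Q,    # pixtral
}

def canonical_name(src: str, bid: int) -> str | None:
    """Return the canonical name for a checkpoint tensor, or None if unmapped."""
    for pattern, target in ALIASES.items():
        if pattern.format(bid=bid) == src:
            return target.format(bid=bid)
    return None

# Either naming scheme lands on the same tensor:
assert canonical_name("vision_tower.transformer.layers.0.attention.q_proj", 0) == "v.blk.0.attn_q"
assert canonical_name("vision_encoder.transformer.layers.0.attention.wq", 0) == "v.blk.0.attn_q"
```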