@@ -896,6 +896,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
+            "visual.merger.mlp.{bid}", # qwen2vl
         ),
 
         MODEL_TENSOR.V_MMPROJ_FC: (
@@ -919,6 +920,11 @@ class TensorNameMap:
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral
+            "visual.patch_embed.proj", # qwen2vl
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_PATCH1: (
+            "visual.patch_embed.proj.weight.1", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -932,59 +938,73 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+            "visual.blocks.{bid}.attn.v", # qwen2vl, generated
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "visual.blocks.{bid}.norm1", # qwen2vl
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+            "visual.blocks.{bid}.attn.proj", # qwen2vl
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+            "visual.blocks.{bid}.norm2", # qwen2vl
         ),
 
+        # note: some of these names are swapped because the original llava code
+        # swapped fc1 and fc2; there is no clean way to fix this, so be careful.
+        # newer models such as pixtral use the correct naming.
         MODEL_TENSOR.V_ENC_FFN_UP: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "visual.blocks.{bid}.mlp.fc2", # qwen2vl (note: name is swapped)
+            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
             "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+            "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "visual.blocks.{bid}.mlp.fc1", # qwen2vl (note: name is swapped)
+            "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
@@ -995,6 +1015,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
             "model.vision_model.post_layernorm", # SmolVLM
+            "visual.merger.ln_q", # qwen2vl
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
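For reviewers unfamiliar with how this table is consumed, here is a minimal sketch of the lookup these tuples drive. The `build_lookup` helper and the two-entry table below are illustrative only, not the actual `TensorNameMap` implementation; they show why the qwen2vl `mlp.fc2` entry intentionally lands under `V_ENC_FFN_UP` given the fc1/fc2 swap noted in the comment above: `{bid}` is expanded once per block, and each checkpoint tensor name is then an exact-match key into the flattened dict.

```python
from enum import Enum, auto

class MODEL_TENSOR(Enum):
    V_ENC_FFN_UP = auto()
    V_ENC_FFN_DOWN = auto()

# Illustrative subset of the mapping table in this diff;
# {bid} is the per-block index placeholder.
block_mappings = {
    MODEL_TENSOR.V_ENC_FFN_UP: (
        "visual.blocks.{bid}.mlp.fc2",        # qwen2vl (swapped naming)
        "visual.blocks.{bid}.mlp.down_proj",  # qwen2.5vl
    ),
    MODEL_TENSOR.V_ENC_FFN_DOWN: (
        "visual.blocks.{bid}.mlp.fc1",        # qwen2vl (swapped naming)
        "visual.blocks.{bid}.mlp.up_proj",    # qwen2.5vl
    ),
}

def build_lookup(n_blocks: int) -> dict[str, MODEL_TENSOR]:
    """Expand {bid} for every block so each checkpoint name is a flat dict hit."""
    lookup: dict[str, MODEL_TENSOR] = {}
    for tensor, patterns in block_mappings.items():
        for bid in range(n_blocks):
            for pattern in patterns:
                lookup[pattern.format(bid=bid)] = tensor
    return lookup

lookup = build_lookup(n_blocks=32)
# The qwen2vl name "mlp.fc2" resolves to FFN_UP, not FFN_DOWN, because of
# the historical llava fc1/fc2 swap documented in the table's comment.
assert lookup["visual.blocks.0.mlp.fc2"] is MODEL_TENSOR.V_ENC_FFN_UP
assert lookup["visual.blocks.0.mlp.fc1"] is MODEL_TENSOR.V_ENC_FFN_DOWN
```

Expanding `{bid}` up front trades a little memory for O(1) exact-name lookups while iterating over checkpoint tensors during conversion.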