@@ -902,10 +902,12 @@ class TensorNameMap:
902
902
903
903
MODEL_TENSOR .V_MMPROJ_FC : (
904
904
"model.connector.modality_projection.proj" , # SmolVLM
905
+ "multi_modal_projector.linear_1" , # llama4
905
906
),
906
907
907
908
MODEL_TENSOR .V_MMPROJ_MLP : (
908
909
"model.mm_projector.mlp.mlp.{bid}" ,
910
+ "vision_model.vision_adapter.mlp.fc{bid}" , # llama4
909
911
"mlp1.{bid}" , # InternVL
910
912
),
911
913
@@ -915,26 +917,30 @@ class TensorNameMap:
915
917
916
918
MODEL_TENSOR .V_ENC_EMBD_CLS : (
917
919
"vision_tower.vision_model.embeddings.class_embedding" ,
920
+ "vision_model.class_embedding" , # llama4
918
921
),
919
922
920
923
MODEL_TENSOR .V_ENC_EMBD_PATCH : (
921
924
"vision_tower.vision_model.embeddings.patch_embedding" ,
922
925
"vpm.embeddings.patch_embedding" ,
923
926
"model.vision_model.embeddings.patch_embedding" , # SmolVLM
924
927
"vision_tower.patch_conv" , # pixtral
928
+ "vision_model.patch_embedding.linear" , # llama4
925
929
"visual.patch_embed.proj" , # qwen2vl
926
930
),
927
931
928
932
MODEL_TENSOR .V_ENC_EMBD_POS : (
929
933
"vision_tower.vision_model.embeddings.position_embedding" ,
930
934
"vpm.embeddings.position_embedding" ,
931
935
"model.vision_model.embeddings.position_embedding" , # SmolVLM
936
+ "vision_model.positional_embedding_vlm" , # llama4
932
937
),
933
938
934
939
MODEL_TENSOR .V_ENC_ATTN_Q : (
935
940
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj" ,
936
941
"vpm.encoder.layers.{bid}.self_attn.q_proj" ,
937
942
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj" , # SmolVLM
943
+ "vision_model.model.layers.{bid}.self_attn.q_proj" , # llama4
938
944
"vision_tower.transformer.layers.{bid}.attention.q_proj" , # pixtral
939
945
"visual.blocks.{bid}.attn.q" , # qwen2vl, generated
940
946
),
@@ -947,6 +953,7 @@ class TensorNameMap:
947
953
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj" ,
948
954
"vpm.encoder.layers.{bid}.self_attn.k_proj" ,
949
955
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj" , # SmolVLM
956
+ "vision_model.model.layers.{bid}.self_attn.k_proj" , # llama4
950
957
"vision_tower.transformer.layers.{bid}.attention.k_proj" , # pixtral
951
958
"visual.blocks.{bid}.attn.k" , # qwen2vl, generated
952
959
),
@@ -959,6 +966,7 @@ class TensorNameMap:
959
966
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj" ,
960
967
"vpm.encoder.layers.{bid}.self_attn.v_proj" ,
961
968
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj" , # SmolVLM
969
+ "vision_model.model.layers.{bid}.self_attn.v_proj" , # llama4
962
970
"vision_tower.transformer.layers.{bid}.attention.v_proj" , # pixtral
963
971
"visual.blocks.{bid}.attn.v" , # qwen2vl, generated
964
972
),
@@ -969,23 +977,26 @@ class TensorNameMap:
969
977
"vpm.encoder.layers.{bid}.layer_norm1" ,
970
978
"model.vision_model.encoder.layers.{bid}.layer_norm1" , # SmolVLM
971
979
"vision_tower.transformer.layers.{bid}.attention_norm" , # pixtral
980
+ "vision_model.model.layers.{bid}.input_layernorm" , # llama4
972
981
"visual.blocks.{bid}.norm1" , # qwen2vl
973
982
),
974
983
975
- MODEL_TENSOR .V_ENC_OUTPUT : (
984
+ MODEL_TENSOR .V_ENC_ATTN_O : (
976
985
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj" ,
977
986
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj" , # InternVL
978
987
"vpm.encoder.layers.{bid}.self_attn.out_proj" ,
979
988
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj" , # SmolVLM
989
+ "vision_model.model.layers.{bid}.self_attn.o_proj" , # llama4
980
990
"vision_tower.transformer.layers.{bid}.attention.o_proj" , # pixtral
981
991
"visual.blocks.{bid}.attn.proj" , # qwen2vl
982
992
),
983
993
984
- MODEL_TENSOR .V_ENC_OUTPUT_NORM : (
994
+ MODEL_TENSOR .V_ENC_POST_ATTN_NORM : (
985
995
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2" ,
986
996
"vision_tower.vision_model.encoder.layers.{bid}.norm2" , # InternVL
987
997
"vpm.encoder.layers.{bid}.layer_norm2" ,
988
998
"model.vision_model.encoder.layers.{bid}.layer_norm2" , # SmolVLM
999
+ "vision_model.model.layers.{bid}.post_attention_layernorm" , # llama4
989
1000
"vision_tower.transformer.layers.{bid}.ffn_norm" , # pixtral
990
1001
"visual.blocks.{bid}.norm2" , # qwen2vl
991
1002
),
@@ -995,6 +1006,7 @@ class TensorNameMap:
995
1006
"vpm.encoder.layers.{bid}.mlp.fc1" ,
996
1007
"model.vision_model.encoder.layers.{bid}.mlp.fc1" , # SmolVLM, gemma3
997
1008
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj" , # pixtral
1009
+ "vision_model.model.layers.{bid}.mlp.fc1" , # llama4
998
1010
"visual.blocks.{bid}.mlp.fc1" , # qwen2vl
999
1011
"visual.blocks.{bid}.mlp.up_proj" , # qwen2.5vl
1000
1012
),
@@ -1009,6 +1021,7 @@ class TensorNameMap:
1009
1021
"vpm.encoder.layers.{bid}.mlp.fc2" ,
1010
1022
"model.vision_model.encoder.layers.{bid}.mlp.fc2" , # SmolVLM, gemma3
1011
1023
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj" , # pixtral
1024
+ "vision_model.model.layers.{bid}.mlp.fc2" , # llama4
1012
1025
"visual.blocks.{bid}.mlp.fc2" , # qwen2vl
1013
1026
"visual.blocks.{bid}.mlp.down_proj" , # qwen2.5vl
1014
1027
),
@@ -1024,11 +1037,13 @@ class TensorNameMap:
1024
1037
MODEL_TENSOR .V_PRE_NORM : (
1025
1038
"vision_tower.vision_model.pre_layrnorm" ,
1026
1039
"vision_tower.ln_pre" , # pixtral
1040
+ "vision_model.layernorm_pre" , # llama4
1027
1041
),
1028
1042
1029
1043
MODEL_TENSOR .V_POST_NORM : (
1030
1044
"vision_tower.vision_model.post_layernorm" ,
1031
1045
"model.vision_model.post_layernorm" , # SmolVLM
1046
+ "vision_model.layernorm_post" , # llama4
1032
1047
"visual.merger.ln_q" , # qwen2vl
1033
1048
),
1034
1049
0 commit comments