@@ -1122,6 +1122,7 @@ class TensorNameMap:
             "vision_encoder.patch_conv",  # pixtral
             "vision_model.patch_embedding.linear",  # llama 4
             "visual.patch_embed.proj",  # qwen2vl
+            "vision_tower.patch_embed.proj",  # kimi-vl
         ),

         MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1130,6 +1131,7 @@ class TensorNameMap:
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding",  # SmolVLM
             "vision_model.positional_embedding_vlm",  # llama 4
+            "vision_tower.patch_embed.pos_emb",  # kimi-vl
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1141,6 +1143,7 @@ class TensorNameMap:
             "vision_tower.transformer.layers.{bid}.attention.q_proj",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wq",  # pixtral
             "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
+            "vision_tower.encoder.blocks.{bid}.wq",  # kimi-vl, generated
         ),

         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1157,6 +1160,7 @@ class TensorNameMap:
             "vision_tower.transformer.layers.{bid}.attention.k_proj",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wk",  # pixtral
             "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
+            "vision_tower.encoder.blocks.{bid}.wk",  # kimi-vl, generated
         ),

         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1173,6 +1177,7 @@ class TensorNameMap:
             "vision_tower.transformer.layers.{bid}.attention.v_proj",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wv",  # pixtral
             "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
+            "vision_tower.encoder.blocks.{bid}.wv",  # kimi-vl, generated
         ),

         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1185,6 +1190,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention_norm",  # pixtral
             "vision_model.model.layers.{bid}.input_layernorm",  # llama4
             "visual.blocks.{bid}.norm1",  # qwen2vl
+            "vision_tower.encoder.blocks.{bid}.norm0",  # kimi-vl (norm0/norm1)
         ),

         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1197,6 +1203,7 @@ class TensorNameMap:
             "vision_tower.transformer.layers.{bid}.attention.o_proj",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wo",  # pixtral
             "visual.blocks.{bid}.attn.proj",  # qwen2vl
+            "vision_tower.encoder.blocks.{bid}.wo",  # kimi-vl
         ),

         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1209,6 +1216,7 @@ class TensorNameMap:
             "vision_tower.transformer.layers.{bid}.ffn_norm",  # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.ffn_norm",  # pixtral
             "visual.blocks.{bid}.norm2",  # qwen2vl
+            "vision_tower.encoder.blocks.{bid}.norm1",  # kimi-vl (norm0/norm1)
         ),

         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1221,6 +1229,7 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.mlp.fc1",  # llama4
             "visual.blocks.{bid}.mlp.fc1",  # qwen2vl
             "visual.blocks.{bid}.mlp.up_proj",  # qwen2.5vl
+            "vision_tower.encoder.blocks.{bid}.mlp.fc0",  # kimi-vl (fc0/fc1)
         ),

         MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1239,6 +1248,7 @@ class TensorNameMap:
             "vision_model.model.layers.{bid}.mlp.fc2",  # llama4
             "visual.blocks.{bid}.mlp.fc2",  # qwen2vl
             "visual.blocks.{bid}.mlp.down_proj",  # qwen2.5vl
+            "vision_tower.encoder.blocks.{bid}.mlp.fc1",  # kimi-vl (fc0/fc1)
         ),

         MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1263,6 +1273,7 @@ class TensorNameMap:
             "model.vision_model.post_layernorm",  # SmolVLM
             "vision_model.layernorm_post",  # llama4
             "visual.merger.ln_q",  # qwen2vl
+            "vision_tower.encoder.final_layernorm",  # kimi-vl
         ),

         MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1272,6 +1283,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
             "multi_modal_projector.layer_norm",
+            "multi_modal_projector.pre_norm",
             "pre_mm_projector_norm",
         ),

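For context on how entries like these are consumed: each string is a source-checkpoint tensor name pattern, with `{bid}` standing for the block index; during conversion the table is expanded per block so that concrete checkpoint names (e.g. the new kimi-vl `vision_tower.encoder.blocks.{bid}.wq`) can be resolved to the corresponding GGUF tensor type. The snippet below is a minimal, illustrative sketch of that expansion and lookup under assumed names (`BLOCK_PATTERNS`, `build_block_lookup`, `resolve`, plain string tensor-type keys, an arbitrary block count) — it is not the actual TensorNameMap implementation in gguf-py.

from __future__ import annotations

# Hypothetical subset of a block-mapping table; real entries use MODEL_TENSOR enums.
BLOCK_PATTERNS: dict[str, tuple[str, ...]] = {
    "V_ENC_ATTN_Q": (
        "visual.blocks.{bid}.attn.q",             # qwen2vl, generated
        "vision_tower.encoder.blocks.{bid}.wq",   # kimi-vl, generated
    ),
    "V_ENC_ATTN_K": (
        "visual.blocks.{bid}.attn.k",             # qwen2vl, generated
        "vision_tower.encoder.blocks.{bid}.wk",   # kimi-vl, generated
    ),
}

def build_block_lookup(n_blocks: int) -> dict[str, tuple[str, int]]:
    # Expand every {bid} pattern for each block index, building a
    # concrete-name -> (tensor type, block id) lookup table.
    lookup: dict[str, tuple[str, int]] = {}
    for tensor_type, patterns in BLOCK_PATTERNS.items():
        for pattern in patterns:
            for bid in range(n_blocks):
                lookup[pattern.format(bid=bid)] = (tensor_type, bid)
    return lookup

def resolve(name: str, lookup: dict[str, tuple[str, int]],
            suffixes: tuple[str, ...] = (".weight", ".bias")) -> tuple[str, int] | None:
    # Strip a known parameter suffix, then look up the base tensor name.
    for suffix in suffixes:
        if name.endswith(suffix):
            return lookup.get(name[:-len(suffix)])
    return lookup.get(name)

lookup = build_block_lookup(n_blocks=27)  # block count chosen arbitrarily for the example
print(resolve("vision_tower.encoder.blocks.7.wq.weight", lookup))  # ('V_ENC_ATTN_Q', 7)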