Skip to content

Commit 79a5462

Browse files
authored
mtmd : support Kimi VL model (ggml-org#15458)
* convert : fix tensor naming conflict for llama 4 vision
* convert ok
* support kimi vision model
* clean up
* fix style
* fix calc number of output tokens
* refactor resize_position_embeddings
* add test case
* rename build fn
* correct a small bug
1 parent 85cc1ae commit 79a5462

File tree

6 files changed

+211
-61
lines changed

6 files changed

+211
-61
lines changed

convert_hf_to_gguf.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6254,9 +6254,11 @@ def prepare_tensors(self):
62546254
raise ValueError(f"Unprocessed experts: {experts}")
62556255

62566256

6257-
@ModelBase.register("DeepseekV2ForCausalLM")
6258-
@ModelBase.register("DeepseekV3ForCausalLM")
6259-
@ModelBase.register("KimiVLForConditionalGeneration")
6257+
@ModelBase.register(
6258+
"DeepseekV2ForCausalLM",
6259+
"DeepseekV3ForCausalLM",
6260+
"KimiVLForConditionalGeneration",
6261+
)
62606262
class DeepseekV2Model(TextModel):
62616263
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
62626264

@@ -8507,6 +8509,43 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
85078509
return "mm.2.weight"
85088510
return super().map_tensor_name(name, try_suffixes)
85098511

8512+
8513+
@ModelBase.register("KimiVLForConditionalGeneration")
class KimiVLModel(MmprojModel):
    """Convert the Kimi-VL vision tower and multimodal projector to GGUF."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # Pin a nominal image size (64 patches * 14 px) so consumers of the
        # GGUF metadata have a concrete value to read.
        self.hparams_vision["image_size"] = 64 * 14  # for compatibility

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL)
        self.gguf_writer.add_vision_use_gelu(True)
        self.gguf_writer.add_vision_projector_scale_factor(2)
        assert self.hparams_vision is not None
        # eps is the same as pytorch's default value
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one HF checkpoint tensor to its GGUF name(s), reshaping/splitting as needed."""
        del bid  # unused

        # Only the vision tower and projector belong to this mmproj file;
        # everything else (the text model) is converted separately.
        if "vision_tower" not in name and "multi_modal_projector" not in name:
            return []  # skip other tensors

        if "pos_emb.weight" in name:
            # Collapse the leading two grid dimensions into one: (h, w, dim) -> (h*w, dim).
            h, w, dim = data_torch.shape[0], data_torch.shape[1], data_torch.shape[2]
            data_torch = data_torch.view(h * w, dim)
        elif "wqkv" in name:
            # Fused QKV tensor: split into three equal parts. Weight tensors
            # are stacked along dim 0; non-weight tensors (bias) along the last dim.
            split_dim = 0 if "weight" in name else -1
            q_part, k_part, v_part = data_torch.chunk(3, dim=split_dim)
            return [
                (self.map_tensor_name(name.replace("wqkv", "wq")), q_part),
                (self.map_tensor_name(name.replace("wqkv", "wk")), k_part),
                (self.map_tensor_name(name.replace("wqkv", "wv")), v_part),
            ]

        return [(self.map_tensor_name(name), data_torch)]
8548+
85108549
###### CONVERSION LOGIC ######
85118550

85128551

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2850,6 +2850,7 @@ class VisionProjectorType:
28502850
QWEN25O = "qwen2.5o" # omni
28512851
VOXTRAL = "voxtral"
28522852
LFM2 = "lfm2"
2853+
KIMIVL = "kimivl"
28532854

28542855

28552856
# Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,6 +1122,7 @@ class TensorNameMap:
11221122
"vision_encoder.patch_conv", # pixtral
11231123
"vision_model.patch_embedding.linear", # llama 4
11241124
"visual.patch_embed.proj", # qwen2vl
1125+
"vision_tower.patch_embed.proj", # kimi-vl
11251126
),
11261127

11271128
MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1130,6 +1131,7 @@ class TensorNameMap:
11301131
"vpm.embeddings.position_embedding",
11311132
"model.vision_model.embeddings.position_embedding", # SmolVLM
11321133
"vision_model.positional_embedding_vlm", # llama 4
1134+
"vision_tower.patch_embed.pos_emb", # kimi-vl
11331135
),
11341136

11351137
MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1141,6 +1143,7 @@ class TensorNameMap:
11411143
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
11421144
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
11431145
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
1146+
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
11441147
),
11451148

11461149
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1157,6 +1160,7 @@ class TensorNameMap:
11571160
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
11581161
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
11591162
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
1163+
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
11601164
),
11611165

11621166
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1173,6 +1177,7 @@ class TensorNameMap:
11731177
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
11741178
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
11751179
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
1180+
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
11761181
),
11771182

11781183
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1185,6 +1190,7 @@ class TensorNameMap:
11851190
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
11861191
"vision_model.model.layers.{bid}.input_layernorm", # llama4
11871192
"visual.blocks.{bid}.norm1", # qwen2vl
1193+
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
11881194
),
11891195

11901196
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1197,6 +1203,7 @@ class TensorNameMap:
11971203
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
11981204
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
11991205
"visual.blocks.{bid}.attn.proj", # qwen2vl
1206+
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
12001207
),
12011208

12021209
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1209,6 +1216,7 @@ class TensorNameMap:
12091216
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
12101217
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
12111218
"visual.blocks.{bid}.norm2", # qwen2vl
1219+
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
12121220
),
12131221

12141222
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1221,6 +1229,7 @@ class TensorNameMap:
12211229
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
12221230
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
12231231
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
1232+
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
12241233
),
12251234

12261235
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1239,6 +1248,7 @@ class TensorNameMap:
12391248
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
12401249
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
12411250
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
1251+
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
12421252
),
12431253

12441254
MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1263,6 +1273,7 @@ class TensorNameMap:
12631273
"model.vision_model.post_layernorm", # SmolVLM
12641274
"vision_model.layernorm_post", # llama4
12651275
"visual.merger.ln_q", # qwen2vl
1276+
"vision_tower.encoder.final_layernorm", # kimi-vl
12661277
),
12671278

12681279
MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1272,6 +1283,7 @@ class TensorNameMap:
12721283
MODEL_TENSOR.V_MM_INP_NORM: (
12731284
"multi_modal_projector.norm",
12741285
"multi_modal_projector.layer_norm",
1286+
"multi_modal_projector.pre_norm",
12751287
"pre_mm_projector_norm",
12761288
),
12771289

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ enum projector_type {
135135
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
136136
PROJECTOR_TYPE_VOXTRAL,
137137
PROJECTOR_TYPE_LFM2,
138+
PROJECTOR_TYPE_KIMIVL,
138139
PROJECTOR_TYPE_UNKNOWN,
139140
};
140141

@@ -156,6 +157,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
156157
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
157158
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
158159
{ PROJECTOR_TYPE_LFM2, "lfm2"},
160+
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
159161
};
160162

161163
static projector_type clip_projector_type_from_string(const std::string & str) {

0 commit comments

Comments (0)