Skip to content

Commit cf9e564

Browse files
gryffindor-rrlzhang
and lzhang authored
mtmd : Fix MinicpmV model converter and clip to avoid using hardcode. (#14750)
* Fix MinicpmV model converter and clip to avoid using hardcode. * Code update for pr/14750 * Remove unused field, update script path in docs. * Add version 5 for fallback code. --------- Co-authored-by: lzhang <[email protected]>
1 parent fba5c0d commit cf9e564

File tree

6 files changed

+117
-79
lines changed

6 files changed

+117
-79
lines changed

docs/multimodal/minicpmo2.6.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
1313

1414
Clone llama.cpp:
1515
```bash
16-
git clone https://github.com/ggerganov/llama.cpp
16+
git clone https://github.com/ggml-org/llama.cpp
1717
cd llama.cpp
1818
```
1919

docs/multimodal/minicpmv2.6.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
1212

1313
Clone llama.cpp:
1414
```bash
15-
git clone https://github.com/ggerganov/llama.cpp
15+
git clone https://github.com/ggml-org/llama.cpp
1616
cd llama.cpp
1717
```
1818

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
4545
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
4646
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
47+
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
4748

4849
// audio-specific
4950
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"

tools/mtmd/clip.cpp

Lines changed: 33 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ struct clip_hparams {
201201
// legacy
202202
bool has_llava_projector = false;
203203
int minicpmv_version = 0;
204+
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
204205
};
205206

206207
struct clip_layer {
@@ -866,21 +867,8 @@ struct clip_graph {
866867
int n_embd = clip_n_mmproj_embd(ctx);
867868
const int d_head = 128;
868869
int n_head = n_embd/d_head;
869-
int num_query = 96;
870-
if (ctx->model.hparams.minicpmv_version == 2) {
871-
// MiniCPM-V 2.5
872-
num_query = 96;
873-
} else if (ctx->model.hparams.minicpmv_version == 3) {
874-
// MiniCPM-V 2.6
875-
num_query = 64;
876-
} else if (ctx->model.hparams.minicpmv_version == 4) {
877-
// MiniCPM-o 2.6
878-
num_query = 64;
879-
} else if (ctx->model.hparams.minicpmv_version == 5) {
880-
// MiniCPM-V 4.0
881-
num_query = 64;
882-
}
883-
870+
// Use actual config value if available, otherwise fall back to hardcoded values
871+
int num_query = ctx->model.hparams.minicpmv_query_num;
884872
ggml_tensor * Q = ggml_add(ctx0,
885873
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
886874
model.mm_model_attn_q_b);
@@ -2138,7 +2126,19 @@ struct clip_model_loader {
21382126
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
21392127
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
21402128
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
2141-
2129+
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
2130+
if (hparams.minicpmv_query_num == 0) {
2131+
// Fallback to hardcoded values for legacy models
2132+
if (hparams.minicpmv_version == 3) {
2133+
hparams.minicpmv_query_num = 64;
2134+
} else if (hparams.minicpmv_version == 4) {
2135+
hparams.minicpmv_query_num = 64;
2136+
} else if (hparams.minicpmv_version == 5) {
2137+
hparams.minicpmv_query_num = 64;
2138+
} else {
2139+
hparams.minicpmv_query_num = 96;
2140+
}
2141+
}
21422142
} else if (is_audio) {
21432143
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
21442144

@@ -3556,20 +3556,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
35563556
} break;
35573557
case PROJECTOR_TYPE_MINICPMV:
35583558
{
3559-
if (params.minicpmv_version == 2) {
3560-
// MiniCPM-V 2.5
3561-
n_patches_sq = 96;
3562-
} else if (params.minicpmv_version == 3) {
3563-
// MiniCPM-V 2.6
3564-
n_patches_sq = 64;
3565-
} else if (params.minicpmv_version == 4) {
3566-
// MiniCPM-o 2.6
3567-
n_patches_sq = 64;
3568-
} else if (params.minicpmv_version == 5) {
3569-
// MiniCPM-V 4.0
3570-
n_patches_sq = 64;
3559+
// Use actual config value if available, otherwise fall back to hardcoded values
3560+
if (params.minicpmv_query_num > 0) {
3561+
n_patches_sq = params.minicpmv_query_num;
35713562
} else {
3572-
GGML_ABORT("Unknown minicpmv version");
3563+
// Fallback to hardcoded values for legacy models
3564+
if (params.minicpmv_version == 2) {
3565+
n_patches_sq = 96;
3566+
} else if (params.minicpmv_version == 3) {
3567+
n_patches_sq = 64;
3568+
} else if (params.minicpmv_version == 4) {
3569+
n_patches_sq = 64;
3570+
} else if (params.minicpmv_version == 5) {
3571+
// MiniCPM-V 4.0
3572+
n_patches_sq = 64;
3573+
} else {
3574+
GGML_ABORT("Unknown minicpmv version");
3575+
}
35733576
}
35743577
} break;
35753578
case PROJECTOR_TYPE_QWEN2VL:
@@ -4102,7 +4105,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
41024105
}
41034106

41044107
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
4105-
const auto & hparams = ctx->model.hparams;
41064108
switch (ctx->model.proj_type) {
41074109
case PROJECTOR_TYPE_LDP:
41084110
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
@@ -4114,20 +4116,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
41144116
case PROJECTOR_TYPE_MLP_NORM:
41154117
return ctx->model.mm_3_b->ne[0];
41164118
case PROJECTOR_TYPE_MINICPMV:
4117-
if (hparams.minicpmv_version == 2) {
4118-
// MiniCPM-V 2.5
4119-
return 4096;
4120-
} else if (hparams.minicpmv_version == 3) {
4121-
// MiniCPM-V 2.6
4122-
return 3584;
4123-
} else if (hparams.minicpmv_version == 4) {
4124-
// MiniCPM-o 2.6
4125-
return 3584;
4126-
} else if (hparams.minicpmv_version == 5) {
4127-
// MiniCPM-V 4.0
4128-
return 2560;
4129-
}
4130-
GGML_ABORT("Unknown minicpmv version");
4119+
return ctx->model.mm_model_proj->ne[0];
41314120
case PROJECTOR_TYPE_GLM_EDGE:
41324121
return ctx->model.mm_model_mlp_3_w->ne[1];
41334122
case PROJECTOR_TYPE_QWEN2VL:

tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py

Lines changed: 79 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,16 @@ def bytes_to_unicode():
517517
# output in the same directory as the model if output_dir is None
518518
dir_model = args.model_dir
519519

520+
# Read config.json to get actual model configuration
521+
config_path = os.path.join(dir_model, "config.json")
522+
model_config = {}
523+
if os.path.isfile(config_path):
524+
with open(config_path, "r", encoding="utf-8") as f:
525+
model_config = json.load(f)
526+
print(f"Loaded config from {config_path}")
527+
else:
528+
print(f"Warning: config.json not found at {config_path}")
529+
520530
# If minicpmv_projector is not specified but the default path exists, use the default path
521531
if args.minicpmv_projector is None:
522532
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
@@ -555,37 +565,62 @@ def bytes_to_unicode():
555565
# processor = CLIPProcessor.from_pretrained(dir_model)
556566

557567
minicpmv_version = args.minicpmv_version
558-
emb_dim = 4096
559-
block_count = 26
560-
if minicpmv_version == 1: # MiniCPM-V 2.0
561-
emb_dim = 2304
562-
block_count = 26
563-
elif minicpmv_version == 2: # MiniCPM-V 2.5
564-
emb_dim = 4096
565-
block_count = 27
566-
elif minicpmv_version == 3: # MiniCPM-V 2.6
567-
emb_dim = 3584
568-
block_count = 27
569-
elif minicpmv_version == 4: # MiniCPM-o 2.6
570-
emb_dim = 3584
571-
block_count = 27
572-
elif minicpmv_version == 5: # MiniCPM-V 4.0
573-
emb_dim = 2560
574-
block_count = 27
575-
576-
default_vision_config = {
577-
"hidden_size": 1152,
578-
"image_size": 980,
579-
"intermediate_size": 4304,
580-
"model_type": "idefics2",
581-
"num_attention_heads": 16,
582-
"num_hidden_layers": 27,
583-
"patch_size": 14,
568+
569+
# Use actual config values instead of hardcoded ones
570+
if model_config:
571+
# For the projector/resampler, use the main model's hidden_size
572+
emb_dim = model_config.get("hidden_size", 1536)
573+
574+
# For the vision model, use vision_config values
575+
vision_config_dict = model_config.get("vision_config", {})
576+
default_vision_config = {
577+
"hidden_size": vision_config_dict.get("hidden_size", 1152),
578+
"image_size": vision_config_dict.get("image_size", 980),
579+
"intermediate_size": vision_config_dict.get("intermediate_size", 4304),
580+
"model_type": vision_config_dict.get("model_type", "siglip"),
581+
"num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
582+
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
583+
"patch_size": vision_config_dict.get("patch_size", 14),
584584
}
585585

586+
# Use vision model's num_hidden_layers for block_count
587+
block_count = vision_config_dict.get("num_hidden_layers", 27)
588+
589+
print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
590+
print(f"Vision config: {default_vision_config}")
591+
else:
592+
# Fallback to original hardcoded logic if config.json not found
593+
emb_dim = 4096
594+
block_count = 26
595+
if minicpmv_version == 1:
596+
emb_dim = 2304
597+
block_count = 26
598+
elif minicpmv_version == 2:
599+
emb_dim = 4096
600+
block_count = 27
601+
elif minicpmv_version == 3:
602+
emb_dim = 3584
603+
block_count = 27
604+
elif minicpmv_version == 4:
605+
emb_dim = 3584
606+
block_count = 27
607+
elif minicpmv_version == 5:
608+
emb_dim = 2560
609+
block_count = 27
610+
611+
default_vision_config = {
612+
"hidden_size": 1152,
613+
"image_size": 980,
614+
"intermediate_size": 4304,
615+
"model_type": "idefics2",
616+
"num_attention_heads": 16,
617+
"num_hidden_layers": 27,
618+
"patch_size": 14,
619+
}
620+
586621
vision_config = Idefics2VisionConfig(**default_vision_config)
587622
model = Idefics2VisionTransformer(vision_config)
588-
if minicpmv_version == 3:
623+
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
589624
vision_config = SiglipVisionConfig(**default_vision_config)
590625
model = SiglipVisionTransformer(vision_config)
591626
elif minicpmv_version == 4:
@@ -644,16 +679,27 @@ def bytes_to_unicode():
644679
fout.add_description("two-tower CLIP model")
645680

646681
if has_vision_encoder:
647-
# vision_model hparams
648-
fout.add_uint32("clip.vision.image_size", 448)
649-
fout.add_uint32("clip.vision.patch_size", 14)
650-
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
651-
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
682+
# vision_model hparams - use actual config values
683+
vision_image_size = model_config.get("image_size", 448) if model_config else 448
684+
vision_patch_size = default_vision_config.get("patch_size", 14)
685+
vision_hidden_size = default_vision_config.get("hidden_size", 1152)
686+
vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
687+
vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
688+
689+
fout.add_uint32("clip.vision.image_size", vision_image_size)
690+
fout.add_uint32("clip.vision.patch_size", vision_patch_size)
691+
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
692+
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
652693
fout.add_uint32("clip.vision.projection_dim", 0)
653-
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
694+
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
654695
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
655696
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
656697

698+
# Add MiniCPM-V specific parameters
699+
query_num = model_config.get("query_num", 0) if model_config else 0
700+
resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
701+
fout.add_uint32("clip.minicpmv_query_num", query_num)
702+
657703
if processor is not None:
658704
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
659705
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std

tools/mtmd/legacy-models/minicpmv-surgery.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
# store these tensors in a new dictionary and torch.save them
1818
projector = {name: checkpoint[name].float() for name in mm_tensors}
19+
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
20+
projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
1921
torch.save(projector, f"{args.model}/minicpmv.projector")
2022

2123
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]

0 commit comments

Comments (0)