
Commit 4621d99

gonna fix them all
1 parent: 2c0d960

1 file changed (+33, -55)

tools/mtmd/clip.cpp

Lines changed: 33 additions & 55 deletions
@@ -174,7 +174,7 @@ struct clip_hparams {
     int32_t image_longest_edge = 0;
     int32_t image_min_pixels = 0;
     int32_t image_max_pixels = 0;
-    int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2
+    int32_t n_merge = 0; // number of patch merges **per-side**
 
     float image_mean[3];
     float image_std[3];
@@ -207,7 +207,8 @@ struct clip_hparams {
     int32_t minicpmv_query_num = 0; // MiniCPM-V query number
 
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int patch_area = patch_size * patch_size * proj_scale_factor;
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
         image_min_pixels = n_tokens_min * patch_area;
         image_max_pixels = n_tokens_max * patch_area;
         warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
@@ -216,11 +217,8 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
-    }
-
-    int get_merge_kernel_size() const {
-        return static_cast<int>(std::sqrt(proj_scale_factor));
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
     }
 };
 
@@ -550,7 +548,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.proj_scale_factor;
+        const int kernel_size = hparams.n_merge;
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -572,13 +570,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.get_merge_kernel_size();
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -602,7 +600,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.get_merge_kernel_size();
+        const int n_merge = hparams.n_merge;
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -628,7 +626,7 @@ struct clip_graph {
         // mistral small 3.1 patch merger
         // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
         if (model.mm_patch_merger_w) {
-            GGML_ASSERT(hparams.proj_scale_factor > 0);
+            GGML_ASSERT(hparams.n_merge > 0);
 
             cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
 
@@ -944,8 +942,7 @@ struct clip_graph {
 
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.proj_scale_factor > 0
-            ? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
@@ -1168,7 +1165,7 @@ struct clip_graph {
 
         // pixel shuffle
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             const int height = n_patches_y;
             const int width = n_patches_x;
@@ -1258,7 +1255,7 @@ struct clip_graph {
         // based on Llama4VisionPixelShuffleMLP
         // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             GGML_ASSERT(scale_factor > 0);
             GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
@@ -1330,7 +1327,7 @@ struct clip_graph {
 
         {
             // patch_merger
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection norm
@@ -2706,19 +2703,16 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                 } break;
             case PROJECTOR_TYPE_LFM2:
                 {
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(64, 256);
                 } break;
@@ -2728,16 +2722,14 @@ struct clip_model_loader {
                     // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                     // TODO: verify the image_min_tokens
                     hparams.rope_theta = 10000.0f;
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_KIMIVL:
                 {
                     hparams.rope_theta = 10000.0f;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // TODO: check kimivl preprocessor for exact values
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
@@ -2746,17 +2738,16 @@ struct clip_model_loader {
                 {
                     // default value (used by all model sizes in gemma 3 family)
                     // number of patches for each **side** is reduced by a factor of 4
-                    hparams.proj_scale_factor = 4;
+                    hparams.n_merge = 4;
                     // test model (tinygemma3) has a different value, we optionally read it
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
             case PROJECTOR_TYPE_QWEN3VL:
                 {
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    hparams.n_merge = 2; // default value for Qwen 2 and 2.5
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                     // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                     // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
@@ -2768,10 +2759,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
@@ -2791,14 +2779,6 @@ struct clip_model_loader {
                 break;
             }
 
-        // sanity check
-        {
-            if (hparams.proj_scale_factor) {
-                const int n_merge = hparams.get_merge_kernel_size();
-                GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
-            }
-        }
-
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -2812,11 +2792,8 @@ struct clip_model_loader {
         LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
-        LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+        LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
         LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-        if (hparams.proj_scale_factor > 0) {
-            LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
-        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels);
         }
@@ -4048,7 +4025,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         clip_image_u8 canvas;
         const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
             original_size,
-            params.patch_size * params.get_merge_kernel_size(),
+            params.patch_size * params.n_merge,
            params.image_min_pixels,
             params.image_max_pixels);
         canvas.nx = canvas_size.width;
@@ -4145,9 +4122,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         {
             GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
             clip_image_u8 resized_image;
+            // the original pixtral model doesn't have n_merge
+            const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
             const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                 original_size,
-                params.patch_size * params.get_merge_kernel_size(),
+                params.patch_size * cur_merge,
                 params.image_min_pixels,
                 params.image_max_pixels);
             img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4178,7 +4157,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
             const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                 original_size,
-                params.patch_size * params.get_merge_kernel_size(),
+                params.patch_size * params.n_merge,
                 params.image_min_pixels,
                 params.image_max_pixels);
             const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4366,15 +4345,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // both X and Y are downscaled by the scale factor
-                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                int scale_factor = ctx->model.hparams.n_merge;
                 n_patches /= (scale_factor * scale_factor);
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = ctx->model.hparams.get_merge_kernel_size();
-                int out_patch_size = params.patch_size * scale_factor;
+                int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
                 n_patches = x_patch * y_patch;
@@ -4383,7 +4361,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.get_merge_kernel_size();
+                int n_merge = ctx->model.hparams.n_merge;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {
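
Editorial note (not part of the commit): the old proj_scale_factor stored the squared merge factor, while the new n_merge stores the per-side merge count, so each former read maps either to n_merge directly (per-side uses such as out_patch_size) or to n_merge * n_merge (area uses such as the qwen3vl merge factor). A minimal standalone C++ sketch of that invariant, using illustrative values (patch_size = 14, n_merge = 2, roughly the Qwen2.5-VL defaults; not read from any model file):

// sketch.cpp: standalone illustration, not part of clip.cpp
#include <cassert>
#include <cstdio>

int main() {
    // hypothetical hparams: 14x14-pixel patches, 2x2 patch merge per side
    const int patch_size = 14;
    const int n_merge    = 2;

    // old semantics: proj_scale_factor == n_merge^2
    const int proj_scale_factor = n_merge * n_merge;

    // pixels consumed per output token, mirroring set_limit_image_tokens()
    const int patch_area = patch_size * patch_size * n_merge * n_merge;
    assert(patch_area == patch_size * patch_size * proj_scale_factor); // same area either way

    // a 448x448 input yields (448/14/2)^2 = 16*16 = 256 output tokens
    const int nx = 448, ny = 448;
    const int n_tokens = (nx / patch_size / n_merge) * (ny / patch_size / n_merge);
    printf("patch_area=%d n_tokens=%d\n", patch_area, n_tokens); // patch_area=784 n_tokens=256
    return 0;
}

Storing the per-side count also removes the sqrt round-trip that get_merge_kernel_size() performed, and a zero value (models without patch merging, e.g. the original pixtral) falls back cleanly to 1 via the cur_merge guards above.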
