@@ -174,7 +174,7 @@ struct clip_hparams {
174174 int32_t image_longest_edge = 0 ;
175175 int32_t image_min_pixels = 0 ;
176176 int32_t image_max_pixels = 0 ;
177- int32_t proj_scale_factor = 0 ; // = (spatial_merge_size)^2
177+ int32_t n_merge = 0 ; // number of patch merges **per-side**
178178
179179 float image_mean[3 ];
180180 float image_std[3 ];
@@ -207,7 +207,8 @@ struct clip_hparams {
207207 int32_t minicpmv_query_num = 0 ; // MiniCPM-V query number
208208
209209 void set_limit_image_tokens (int n_tokens_min, int n_tokens_max) {
210- const int patch_area = patch_size * patch_size * proj_scale_factor;
210+ const int cur_merge = n_merge == 0 ? 1 : n_merge;
211+ const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
211212 image_min_pixels = n_tokens_min * patch_area;
212213 image_max_pixels = n_tokens_max * patch_area;
213214 warmup_image_size = static_cast <int >(std::sqrt (image_max_pixels));
@@ -216,11 +217,8 @@ struct clip_hparams {
216217 void set_warmup_n_tokens (int n_tokens) {
217218 int n_tok_per_side = static_cast <int >(std::sqrt (n_tokens));
218219 GGML_ASSERT (n_tok_per_side * n_tok_per_side == n_tokens && " n_tokens must be n*n" );
219- warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size ();
220- }
221-
222- int get_merge_kernel_size () const {
223- return static_cast <int >(std::sqrt (proj_scale_factor));
220+ const int cur_merge = n_merge == 0 ? 1 : n_merge;
221+ warmup_image_size = n_tok_per_side * patch_size * cur_merge;
224222 }
225223};
226224
@@ -550,7 +548,7 @@ struct clip_graph {
550548 const int batch_size = 1 ;
551549 GGML_ASSERT (n_patches_x == n_patches_y);
552550 const int patches_per_image = n_patches_x;
553- const int kernel_size = hparams.proj_scale_factor ;
551+ const int kernel_size = hparams.n_merge ;
554552
555553 cur = ggml_transpose (ctx0, cur);
556554 cur = ggml_cont_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -572,13 +570,13 @@ struct clip_graph {
572570 } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
573571 // pixel_shuffle
574572 // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
575- const int scale_factor = model.hparams .proj_scale_factor ;
573+ const int scale_factor = model.hparams .n_merge ;
576574 cur = build_patch_merge_permute (cur, scale_factor);
577575 cur = ggml_mul_mat (ctx0, model.projection , cur);
578576
579577 } else if (ctx->proj_type () == PROJECTOR_TYPE_LFM2) {
580578 // pixel unshuffle block
581- const int scale_factor = model.hparams .get_merge_kernel_size () ;
579+ const int scale_factor = model.hparams .n_merge ;
582580 cur = build_patch_merge_permute (cur, scale_factor);
583581
584582 // projection
@@ -602,7 +600,7 @@ struct clip_graph {
602600 }
603601
604602 ggml_cgraph * build_pixtral () {
605- const int n_merge = hparams.get_merge_kernel_size () ;
603+ const int n_merge = hparams.n_merge ;
606604
607605 // 2D input positions
608606 ggml_tensor * pos_h = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_patches);
@@ -628,7 +626,7 @@ struct clip_graph {
628626 // mistral small 3.1 patch merger
629627 // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
630628 if (model.mm_patch_merger_w ) {
631- GGML_ASSERT (hparams.proj_scale_factor > 0 );
629+ GGML_ASSERT (hparams.n_merge > 0 );
632630
633631 cur = ggml_mul (ctx0, ggml_rms_norm (ctx0, cur, eps), model.mm_input_norm_w );
634632
@@ -944,8 +942,7 @@ struct clip_graph {
944942
945943 // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
946944 ggml_tensor * deepstack_features = nullptr ;
947- const int merge_factor = hparams.proj_scale_factor > 0
948- ? hparams.proj_scale_factor : 4 ; // default 2x2=4 for qwen3vl
945+ const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4 ; // default 2x2=4 for qwen3vl
949946
950947 // loop over layers
951948 for (int il = 0 ; il < n_layer; il++) {
@@ -1168,7 +1165,7 @@ struct clip_graph {
11681165
11691166 // pixel shuffle
11701167 {
1171- const int scale_factor = model.hparams .proj_scale_factor ;
1168+ const int scale_factor = model.hparams .n_merge ;
11721169 const int bsz = 1 ; // batch size, always 1 for now since we don't support batching
11731170 const int height = n_patches_y;
11741171 const int width = n_patches_x;
@@ -1258,7 +1255,7 @@ struct clip_graph {
12581255 // based on Llama4VisionPixelShuffleMLP
12591256 // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
12601257 {
1261- const int scale_factor = model.hparams .proj_scale_factor ;
1258+ const int scale_factor = model.hparams .n_merge ;
12621259 const int bsz = 1 ; // batch size, always 1 for now since we don't support batching
12631260 GGML_ASSERT (scale_factor > 0 );
12641261 GGML_ASSERT (n_patches_x == n_patches_y); // llama4 only supports square images
@@ -1330,7 +1327,7 @@ struct clip_graph {
13301327
13311328 {
13321329 // patch_merger
1333- const int scale_factor = model.hparams .proj_scale_factor ;
1330+ const int scale_factor = model.hparams .n_merge ;
13341331 cur = build_patch_merge_permute (cur, scale_factor);
13351332
13361333 // projection norm
@@ -2706,19 +2703,16 @@ struct clip_model_loader {
27062703 } break ;
27072704 case PROJECTOR_TYPE_INTERNVL:
27082705 {
2709- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2706+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27102707 } break ;
27112708 case PROJECTOR_TYPE_IDEFICS3:
27122709 {
2713- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2710+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27142711 get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge , false );
27152712 } break ;
27162713 case PROJECTOR_TYPE_LFM2:
27172714 {
2718- // correct non-standard proj_scale_factor value
2719- int spatial_merge = 2 ;
2720- get_u32 (KEY_PROJ_SCALE_FACTOR, spatial_merge, false );
2721- hparams.proj_scale_factor = spatial_merge * spatial_merge;
2715+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27222716 // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
27232717 hparams.set_limit_image_tokens (64 , 256 );
27242718 } break ;
@@ -2728,16 +2722,14 @@ struct clip_model_loader {
27282722 // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
27292723 // TODO: verify the image_min_tokens
27302724 hparams.rope_theta = 10000 .0f ;
2731- int spatial_merge = 2 ;
2732- get_u32 (KEY_SPATIAL_MERGE_SIZE, spatial_merge, false );
2733- hparams.proj_scale_factor = spatial_merge * spatial_merge;
2725+ get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.n_merge , false );
27342726 hparams.set_limit_image_tokens (8 , 1024 );
27352727 hparams.set_warmup_n_tokens (256 ); // avoid OOM on warmup
27362728 } break ;
27372729 case PROJECTOR_TYPE_KIMIVL:
27382730 {
27392731 hparams.rope_theta = 10000 .0f ;
2740- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2732+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27412733 // TODO: check kimivl preprocessor for exact values
27422734 hparams.set_limit_image_tokens (8 , 1024 );
27432735 hparams.set_warmup_n_tokens (256 ); // avoid OOM on warmup
@@ -2746,17 +2738,16 @@ struct clip_model_loader {
27462738 {
27472739 // default value (used by all model sizes in gemma 3 family)
27482740 // number of patches for each **side** is reduced by a factor of 4
2749- hparams.proj_scale_factor = 4 ;
2741+ hparams.n_merge = 4 ;
27502742 // test model (tinygemma3) has a different value, we optionally read it
2751- get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2743+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27522744 } break ;
27532745 case PROJECTOR_TYPE_QWEN2VL:
27542746 case PROJECTOR_TYPE_QWEN25VL:
27552747 case PROJECTOR_TYPE_QWEN3VL:
27562748 {
2757- int spatial_merge = 2 ;
2758- get_u32 (KEY_SPATIAL_MERGE_SIZE, spatial_merge, false );
2759- hparams.proj_scale_factor = spatial_merge * spatial_merge;
2749+ hparams.n_merge = 2 ; // default value for Qwen 2 and 2.5
2750+ get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.n_merge , false );
27602751 get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern , model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
27612752 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
27622753 // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
@@ -2768,10 +2759,7 @@ struct clip_model_loader {
27682759 case PROJECTOR_TYPE_LLAMA4:
27692760 {
27702761 hparams.rope_theta = 10000 .0f ;
2771- // correct non-standard proj_scale_factor value
2772- int spatial_merge = 2 ;
2773- get_u32 (KEY_PROJ_SCALE_FACTOR, spatial_merge, false );
2774- hparams.proj_scale_factor = spatial_merge * spatial_merge;
2762+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.n_merge , false );
27752763 set_llava_uhd_res_candidates (model, 3 );
27762764 } break ;
27772765 case PROJECTOR_TYPE_ULTRAVOX:
@@ -2791,14 +2779,6 @@ struct clip_model_loader {
27912779 break ;
27922780 }
27932781
2794- // sanity check
2795- {
2796- if (hparams.proj_scale_factor ) {
2797- const int n_merge = hparams.get_merge_kernel_size ();
2798- GGML_ASSERT (n_merge * n_merge == hparams.proj_scale_factor );
2799- }
2800- }
2801-
28022782 LOG_INF (" %s: projector: %s\n " , __func__, proj_type.c_str ());
28032783 LOG_INF (" %s: n_embd: %d\n " , __func__, hparams.n_embd );
28042784 LOG_INF (" %s: n_head: %d\n " , __func__, hparams.n_head );
@@ -2812,11 +2792,8 @@ struct clip_model_loader {
28122792 LOG_INF (" %s: patch_size: %d\n " , __func__, hparams.patch_size );
28132793 LOG_INF (" %s: has_llava_proj: %d\n " , __func__, hparams.has_llava_projector );
28142794 LOG_INF (" %s: minicpmv_version: %d\n " , __func__, hparams.minicpmv_version );
2815- LOG_INF (" %s: proj_scale_factor : %d\n " , __func__, hparams.proj_scale_factor );
2795+ LOG_INF (" %s: n_merge : %d\n " , __func__, hparams.n_merge );
28162796 LOG_INF (" %s: n_wa_pattern: %d\n " , __func__, hparams.n_wa_pattern );
2817- if (hparams.proj_scale_factor > 0 ) {
2818- LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
2819- }
28202797 if (hparams.image_min_pixels > 0 ) {
28212798 LOG_INF (" %s: image_min_pixels: %d\n " , __func__, hparams.image_min_pixels );
28222799 }
@@ -4048,7 +4025,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40484025 clip_image_u8 canvas;
40494026 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio (
40504027 original_size,
4051- params.patch_size * params.get_merge_kernel_size () ,
4028+ params.patch_size * params.n_merge ,
40524029 params.image_min_pixels ,
40534030 params.image_max_pixels );
40544031 canvas.nx = canvas_size.width ;
@@ -4145,9 +4122,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41454122 {
41464123 GGML_ASSERT (params.image_min_pixels && params.image_max_pixels );
41474124 clip_image_u8 resized_image;
4125+ // the original pixtral model doesn't have n_merge
4126+ const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge ;
41484127 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41494128 original_size,
4150- params.patch_size * params. get_merge_kernel_size () ,
4129+ params.patch_size * cur_merge ,
41514130 params.image_min_pixels ,
41524131 params.image_max_pixels );
41534132 img_tool::resize (*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4178,7 +4157,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41784157 GGML_ASSERT (params.image_min_pixels && params.image_max_pixels );
41794158 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41804159 original_size,
4181- params.patch_size * params.get_merge_kernel_size () ,
4160+ params.patch_size * params.n_merge ,
41824161 params.image_min_pixels ,
41834162 params.image_max_pixels );
41844163 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
@@ -4366,15 +4345,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43664345 case PROJECTOR_TYPE_LLAMA4:
43674346 {
43684347 // both X and Y are downscaled by the scale factor
4369- int scale_factor = ctx->model .hparams .proj_scale_factor ;
4348+ int scale_factor = ctx->model .hparams .n_merge ;
43704349 n_patches /= (scale_factor * scale_factor);
43714350 } break ;
43724351 case PROJECTOR_TYPE_LFM2:
43734352 case PROJECTOR_TYPE_KIMIVL:
43744353 {
43754354 // dynamic size
4376- int scale_factor = ctx->model .hparams .get_merge_kernel_size ();
4377- int out_patch_size = params.patch_size * scale_factor;
4355+ int out_patch_size = params.patch_size * ctx->model .hparams .n_merge ;
43784356 int x_patch = CLIP_ALIGN (img->nx , out_patch_size) / out_patch_size;
43794357 int y_patch = CLIP_ALIGN (img->ny , out_patch_size) / out_patch_size;
43804358 n_patches = x_patch * y_patch;
@@ -4383,7 +4361,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43834361 case PROJECTOR_TYPE_LIGHTONOCR:
43844362 {
43854363 // dynamic size
4386- int n_merge = params. get_merge_kernel_size () ;
4364+ int n_merge = ctx-> model . hparams . n_merge ;
43874365 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1 );
43884366 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1 );
43894367 if (ctx->model .token_embd_img_break ) {
0 commit comments