@@ -174,7 +174,7 @@ struct clip_hparams {
     int32_t image_longest_edge = 0;
     int32_t image_min_pixels = 0;
     int32_t image_max_pixels = 0;
-    int32_t proj_scale_factor = 0;
+    int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2
 
     float image_mean[3];
     float image_std[3];
@@ -196,7 +196,6 @@ struct clip_hparams {
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
-    int32_t spatial_merge_size = 0;
 
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
@@ -209,9 +208,16 @@ struct clip_hparams {
 
     // used by LFM2 and KIMI-VL
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int total_factor = patch_size * proj_scale_factor;
-        image_min_pixels = n_tokens_min * total_factor * total_factor;
-        image_max_pixels = n_tokens_max * total_factor * total_factor;
+        const int patch_area = patch_size * patch_size * proj_scale_factor;
+        image_min_pixels = n_tokens_min * patch_area;
+        image_max_pixels = n_tokens_max * patch_area;
+        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+    }
+
+    void set_warmup_n_tokens(int n_tokens) {
+        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+        warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));
     }
 };
 
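For concreteness, here is the budget arithmetic the two helpers above implement, as a standalone sketch. The values (patch_size = 16, spatial_merge_size = 2, hence proj_scale_factor = 4) are illustrative and not taken from this diff:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
    const int patch_size        = 16;
    const int proj_scale_factor = 2 * 2; // (spatial_merge_size)^2

    // set_limit_image_tokens(8, 1024): one output token covers
    // patch_size^2 * proj_scale_factor = 1024 pixels with these values
    const int patch_area       = patch_size * patch_size * proj_scale_factor;
    const int image_min_pixels = 8    * patch_area; //    8192 px
    const int image_max_pixels = 1024 * patch_area; // 1048576 px (~1 MP)

    // set_warmup_n_tokens(256): a 16x16 token grid, each token spanning
    // patch_size * sqrt(proj_scale_factor) = 32 px per side -> 512x512 warmup
    const int n_tok_per_side = static_cast<int>(std::sqrt(256));
    assert(n_tok_per_side * n_tok_per_side == 256);
    const int warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));

    printf("min=%d max=%d warmup=%d\n", image_min_pixels, image_max_pixels, warmup_image_size);
}
```

Note that set_limit_image_tokens already derives a warmup_image_size (1024 here) from the max budget; set_warmup_n_tokens then overrides it with a smaller one (512 here), which is why the loader below calls them in that order.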
@@ -593,7 +599,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.spatial_merge_size;
+        const int n_merge = static_cast<int>(std::sqrt(hparams.proj_scale_factor)); // per-side merge = sqrt(proj_scale_factor)
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
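The two I32 tensors hold per-patch row and column indices, presumably consumed by pixtral's 2D RoPE; the buffers themselves are filled at eval time. A sketch of the expected contents, assuming row-major patch order (the grid dimensions are illustrative):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int n_patches_x = 4, n_patches_y = 3;
    std::vector<int> pos_h, pos_w; // row / column index of each patch
    for (int y = 0; y < n_patches_y; y++) {
        for (int x = 0; x < n_patches_x; x++) {
            pos_h.push_back(y);
            pos_w.push_back(x);
        }
    }
    printf("patch 5 -> (h=%d, w=%d)\n", pos_h[5], pos_w[5]); // row 1, column 1
}
```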
@@ -619,7 +625,7 @@ struct clip_graph {
         // mistral small 3.1 patch merger
         // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
         if (model.mm_patch_merger_w) {
-            GGML_ASSERT(hparams.spatial_merge_size > 0);
+            GGML_ASSERT(hparams.proj_scale_factor > 0);
 
             cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
 
@@ -935,7 +941,7 @@ struct clip_graph {
 
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor : 4; // proj_scale_factor is already squared; default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
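Here merge_factor is the total number of patches folded into one output token. A shape sketch with illustrative dimensions (n_embd = 1152 is an assumption, not from this diff): each group of 2x2 neighbouring patch embeddings is concatenated along the feature dimension, quartering the token count:

```cpp
#include <cstdio>

int main() {
    const int n_embd       = 1152; // illustrative hidden size
    const int n_patches_x  = 32;
    const int n_patches_y  = 32;
    const int merge_factor = 4;    // proj_scale_factor, i.e. 2x2

    const int n_patches  = n_patches_x * n_patches_y; // 1024
    const int n_tokens   = n_patches / merge_factor;  // 256 tokens after merging
    const int n_embd_out = n_embd * merge_factor;     // 4608-wide merged embeddings

    printf("%d patches -> %d tokens of width %d\n", n_patches, n_tokens, n_embd_out);
}
```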
@@ -2700,25 +2706,32 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
+                {
+                    hparams.set_limit_image_tokens(8, 1024);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                } break;
             case PROJECTOR_TYPE_LFM2:
                 {
-                    hparams.set_limit_image_tokens(64, 1024);
+                    hparams.set_limit_image_tokens(8, 256);
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
             case PROJECTOR_TYPE_LIGHTONOCR:
                 {
                     hparams.rope_theta = 10000.0f;
-                    hparams.warmup_image_size = hparams.patch_size * 8;
-                    hparams.set_limit_image_tokens(64, 1024);
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
+                    int spatial_merge = 2;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    hparams.set_limit_image_tokens(8, 1024);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_KIMIVL:
                 {
                     hparams.rope_theta = 10000.0f;
-                    hparams.warmup_image_size = hparams.patch_size * 8;
-                    hparams.set_limit_image_tokens(64, 1024);
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    hparams.set_limit_image_tokens(8, 1024);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
@@ -2729,29 +2742,15 @@ struct clip_model_loader {
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
-                {
-                    // max image size = sqrt(max_pixels) = 3584
-                    // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
-                    // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
-                    // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
-                    hparams.image_size = 1024;
-                    hparams.warmup_image_size = hparams.patch_size * 8;
-                } break;
             case PROJECTOR_TYPE_QWEN25VL:
-                {
-                    // max image size = sqrt(max_pixels)
-                    // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-                    // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable
-                    // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
-                    hparams.image_size = 1024;
-                    hparams.warmup_image_size = hparams.patch_size * 8;
-                    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
-                } break;
             case PROJECTOR_TYPE_QWEN3VL:
                 {
-                    hparams.image_size = 1024; // still need this?
-                    hparams.warmup_image_size = hparams.patch_size * 8;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
+                    int spatial_merge = 2;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
+                    hparams.set_limit_image_tokens(8, 1024);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
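Replacing the hard-coded image_size = 1024 with pixel budgets lets the preprocessor scale arbitrary inputs into [image_min_pixels, image_max_pixels] instead of snapping them to a fixed square. A minimal sketch of that scaling math; the real resize logic lives elsewhere in clip.cpp, and fit_to_pixel_budget is a hypothetical name:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// clamp w x h into [min_pixels, max_pixels], preserving aspect ratio;
// flooring keeps the result at or under the upper budget
static void fit_to_pixel_budget(int & w, int & h, int min_pixels, int max_pixels) {
    const double px = 1.0 * w * h;
    double scale = 1.0;
    if (px > max_pixels) {
        scale = std::sqrt(max_pixels / px); // shrink
    } else if (px < min_pixels) {
        scale = std::sqrt(min_pixels / px); // enlarge
    }
    w = std::max(1, static_cast<int>(w * scale));
    h = std::max(1, static_cast<int>(h * scale));
}

int main() {
    int w = 4032, h = 3024; // a 12 MP photo
    fit_to_pixel_budget(w, h, 8192, 1048576); // budget from set_limit_image_tokens(8, 1024)
    printf("resized to %dx%d (%d px)\n", w, h, w * h); // ~1 MP, aspect ratio kept
}
```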
@@ -2791,8 +2790,8 @@ struct clip_model_loader {
                 LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
                 LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
                 LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-                if (hparams.spatial_merge_size > 0) {
-                    LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
+                if (hparams.proj_scale_factor > 0) {
+                    LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
                 }
             } else if (is_audio) {
                 LOG_INF("\n--- audio hparams ---\n");
@@ -4310,7 +4309,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.spatial_merge_size;
+                int n_merge = static_cast<int>(std::sqrt(params.proj_scale_factor)); // per-side merge = sqrt(proj_scale_factor)
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {
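A worked example of the dynamic token count for a PIXTRAL-style model, with an illustrative 1024x768 input, patch_size = 16 and a 2x2 merge. The [IMG_BREAK] accounting (one break token between consecutive rows) is an assumption here, since the branch body is cut off by the hunk:

```cpp
#include <cstdio>

int main() {
    const int nx = 1024, ny = 768;
    const int patch_size = 16;
    const int n_merge    = 2; // per-side, i.e. sqrt(proj_scale_factor)

    const int n_patches_x = nx / patch_size / n_merge; // 32
    const int n_patches_y = ny / patch_size / n_merge; // 24
    int n_tokens = n_patches_x * n_patches_y;          // 768

    const bool has_img_break = true; // token_embd_img_break present
    if (has_img_break) {
        n_tokens += n_patches_y - 1; // assumed: one [IMG_BREAK] between rows -> 791
    }
    printf("%dx%d -> %d image tokens\n", nx, ny, n_tokens);
}
```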