@@ -169,8 +169,8 @@ struct clip_hparams {
     int32_t n_layer;
     // idefics3
     int32_t image_longest_edge = 0;
-    int32_t image_min_pixels = 0;
-    int32_t image_max_pixels = 0;
+    int32_t image_min_pixels = -1;
+    int32_t image_max_pixels = -1;
     int32_t n_merge = 0; // number of patch merges **per-side**
 
     float image_mean[3];
@@ -203,11 +203,15 @@ struct clip_hparams {
     int minicpmv_version = 0;
     int32_t minicpmv_query_num = 0; // MiniCPM-V query number
 
+    // custom value provided by user, can be undefined if not set
+    int32_t custom_image_min_tokens = -1;
+    int32_t custom_image_max_tokens = -1;
+
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
-        image_min_pixels = n_tokens_min * patch_area;
-        image_max_pixels = n_tokens_max * patch_area;
+        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
+        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
         warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
     }
 
@@ -216,6 +220,7 @@ struct clip_hparams {
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        // TODO: support warmup size for custom token numbers
     }
 };
 
@@ -459,6 +464,13 @@ struct clip_ctx {
459464 LOG_INF (" %s: CLIP using CPU backend\n " , __func__);
460465 }
461466
467+ if (ctx_params.image_min_tokens > 0 ) {
468+ model.hparams .custom_image_min_tokens = ctx_params.image_min_tokens ;
469+ }
470+ if (ctx_params.image_max_tokens > 0 ) {
471+ model.hparams .custom_image_max_tokens = ctx_params.image_max_tokens ;
472+ }
473+
462474 backend_ptrs.push_back (backend_cpu);
463475 backend_buft.push_back (ggml_backend_get_default_buffer_type (backend_cpu));
464476
@@ -2786,6 +2798,12 @@ struct clip_model_loader {
                     // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
                     hparams.set_limit_image_tokens(8, 2048);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                    const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
+                    if (hparams.image_min_pixels < warn_min_pixels) {
+                        LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
+                        LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
+                        LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
+                    }
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
@@ -2810,6 +2828,13 @@ struct clip_model_loader {
                 break;
         }
 
+        // sanity check
+        {
+            if (hparams.image_max_pixels < hparams.image_min_pixels) {
+                throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
+            }
+        }
+
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -2826,10 +2851,10 @@ struct clip_model_loader {
28262851 LOG_INF (" %s: n_merge: %d\n " , __func__, hparams.n_merge );
28272852 LOG_INF (" %s: n_wa_pattern: %d\n " , __func__, hparams.n_wa_pattern );
28282853 if (hparams.image_min_pixels > 0 ) {
2829- LOG_INF (" %s: image_min_pixels: %d\n " , __func__, hparams.image_min_pixels );
2854+ LOG_INF (" %s: image_min_pixels: %d%s \n " , __func__, hparams.image_min_pixels , hparams. custom_image_min_tokens > 0 ? " (custom value) " : " " );
28302855 }
28312856 if (hparams.image_max_pixels > 0 ) {
2832- LOG_INF (" %s: image_max_pixels: %d\n " , __func__, hparams.image_max_pixels );
2857+ LOG_INF (" %s: image_max_pixels: %d%s \n " , __func__, hparams.image_max_pixels , hparams. custom_image_max_tokens > 0 ? " (custom value) " : " " );
28332858 }
28342859 } else if (is_audio) {
28352860 LOG_INF (" \n --- audio hparams ---\n " );
@@ -4169,7 +4194,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                // step 1: make a blank canvas which aligns to the grid
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized;
                 const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                     original_size,
@@ -4262,7 +4287,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
-                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 clip_image_u8 resized_image;
                 // the original pixtral model doesn't have n_merge
                 const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
@@ -4296,7 +4321,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
-                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
                     params.patch_size * params.n_merge,
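For reference, a minimal standalone sketch of the token-to-pixel conversion that set_limit_image_tokens performs (and that the new custom_image_min_tokens / custom_image_max_tokens overrides feed into) is shown below. The patch_size, n_merge, and token count used here are illustrative assumptions, not values read from any particular model; they only demonstrate the arithmetic in the patch above.

// Standalone illustration of the token -> pixel conversion in set_limit_image_tokens.
// Assumed values: patch_size = 16, n_merge = 2, and a user-supplied minimum of
// 1024 tokens (as suggested by the Qwen-VL warning in the patch).
#include <cmath>
#include <cstdio>

int main() {
    const int patch_size        = 16;   // assumed ViT patch size
    const int n_merge           = 2;    // assumed per-side patch merge factor
    const int custom_min_tokens = 1024; // e.g. --image-min-tokens 1024

    // one image token covers (patch_size * n_merge)^2 pixels after patch merging
    const int patch_area = patch_size * patch_size * n_merge * n_merge;
    const int min_pixels = custom_min_tokens * patch_area;

    std::printf("image_min_pixels = %d (~%dx%d)\n",
                min_pixels,
                (int) std::sqrt((double) min_pixels),
                (int) std::sqrt((double) min_pixels));
    return 0;
}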