@@ -171,7 +171,9 @@ struct clip_hparams {
171171 int32_t n_head;
172172 int32_t n_layer;
173173 // idefics3
174- int32_t preproc_image_size = 0 ; // aka max_dimension
174+ int32_t image_longest_edge = 0 ;
175+ int32_t image_min_pixels = 0 ;
176+ int32_t image_max_pixels = 0 ;
175177 int32_t proj_scale_factor = 0 ;
176178
177179 float image_mean[3 ];
@@ -204,6 +206,13 @@ struct clip_hparams {
204206 bool has_llava_projector = false ;
205207 int minicpmv_version = 0 ;
206208 int32_t minicpmv_query_num = 0 ; // MiniCPM-V query number
209+
210+ // used by LFM2 and KIMI-VL
211+ void set_limit_image_tokens (int n_tokens_min, int n_tokens_max) {
212+ const int total_factor = patch_size * proj_scale_factor;
213+ image_min_pixels = n_tokens_min * total_factor * total_factor;
214+ image_max_pixels = n_tokens_max * total_factor * total_factor;
215+ }
207216};
208217
209218struct clip_layer {
@@ -2577,7 +2586,7 @@ struct clip_model_loader {
25772586
25782587 if (is_vision) {
25792588 get_u32 (KEY_IMAGE_SIZE, hparams.image_size );
2580- get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size , false );
2589+ get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge , false );
25812590 get_u32 (KEY_PATCH_SIZE, hparams.patch_size );
25822591 get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
25832592 get_i32 (KEY_MINICPMV_VERSION, hparams.minicpmv_version , false ); // legacy
@@ -2686,26 +2695,29 @@ struct clip_model_loader {
26862695 hparams.minicpmv_version = 2 ; // default to 2 if not set
26872696 }
26882697 } break ;
2698+ case PROJECTOR_TYPE_INTERNVL:
2699+ {
2700+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2701+ } break ;
26892702 case PROJECTOR_TYPE_IDEFICS3:
26902703 case PROJECTOR_TYPE_LFM2:
2691- case PROJECTOR_TYPE_INTERNVL:
26922704 {
2705+ hparams.set_limit_image_tokens (64 , 1024 );
26932706 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
26942707 } break ;
26952708 case PROJECTOR_TYPE_PIXTRAL:
26962709 case PROJECTOR_TYPE_LIGHTONOCR:
26972710 {
26982711 hparams.rope_theta = 10000 .0f ;
26992712 hparams.warmup_image_size = hparams.patch_size * 8 ;
2700- // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
2701- // ref: https://github.com/ggml-org/llama.cpp/issues/14310
2702- hparams.image_size = 1024 ;
// NOTE(review): set_limit_image_tokens() computes its limits from patch_size * proj_scale_factor,
// but this case only loads spatial_merge_size; if proj_scale_factor is still at its default of 0
// at this point, image_min_pixels and image_max_pixels both end up 0 - confirm this is intended.
2713+ hparams.set_limit_image_tokens (64 , 1024 );
27032714 get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size , false );
27042715 } break ;
27052716 case PROJECTOR_TYPE_KIMIVL:
27062717 {
27072718 hparams.rope_theta = 10000 .0f ;
27082719 hparams.warmup_image_size = hparams.patch_size * 8 ;
2720+ hparams.set_limit_image_tokens (64 , 1024 );
27092721 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
27102722 } break ;
27112723 case PROJECTOR_TYPE_GEMMA3:
@@ -3494,14 +3506,14 @@ struct img_tool {
34943506
34953507 // calculate the size of the **resized** image, while preserving the aspect ratio
34963508 // the calculated size will be aligned to the nearest multiple of align_size
3497- // if H or W size is larger than max_dimension , it will be resized to max_dimension
3498- static clip_image_size calc_size_preserved_ratio (const clip_image_size & inp_size, const int align_size, const int max_dimension ) {
3499- if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0 ) {
3509+ // if H or W size is larger than longest_edge , it will be resized to longest_edge
3510+ static clip_image_size calc_size_preserved_ratio (const clip_image_size & inp_size, const int align_size, const int longest_edge ) {
3511+ if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0 ) {
35003512 return {0 , 0 };
35013513 }
35023514
3503- float scale = std::min (static_cast <float >(max_dimension ) / inp_size.width ,
3504- static_cast <float >(max_dimension ) / inp_size.height );
3515+ float scale = std::min (static_cast <float >(longest_edge ) / inp_size.width ,
3516+ static_cast <float >(longest_edge ) / inp_size.height );
35053517
35063518 float target_width_f = static_cast <float >(inp_size.width ) * scale;
35073519 float target_height_f = static_cast <float >(inp_size.height ) * scale;
@@ -3512,6 +3524,33 @@ struct img_tool {
35123524 return {aligned_width, aligned_height};
35133525 }
35143526
3527+ // calculate the size of the **resized** image, while preserving the aspect ratio
3528+ // the calculated size will have min_pixels <= W*H <= max_pixels
3529+ // this is referred as "smart_resize" in transformers code
3530+ static clip_image_size calc_size_preserved_ratio (const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3531+ const int width = inp_size.width ;
3532+ const int height = inp_size.height ;
3533+
3534+ auto round_by_factor = [f = align_size](float x) { return static_cast <int >(std::nearbyintf (x / static_cast <float >(f))) * f; };
3535+ auto ceil_by_factor = [f = align_size](float x) { return static_cast <int >(std::ceil (x / static_cast <float >(f))) * f; };
3536+ auto floor_by_factor = [f = align_size](float x) { return static_cast <int >(std::floor (x / static_cast <float >(f))) * f; };
3537+
3538+ int h_bar = std::max (align_size, round_by_factor (height));
3539+ int w_bar = std::max (align_size, round_by_factor (width));
3540+
3541+ if (h_bar * w_bar > max_pixels) {
3542+ const auto beta = std::sqrt (static_cast <float >(height * width) / max_pixels);
3543+ h_bar = std::max (align_size, floor_by_factor (height / beta));
3544+ w_bar = std::max (align_size, floor_by_factor (width / beta));
3545+ } else if (h_bar * w_bar < min_pixels) {
3546+ const auto beta = std::sqrt (static_cast <float >(min_pixels) / (height * width));
3547+ h_bar = ceil_by_factor (height * beta);
3548+ w_bar = ceil_by_factor (width * beta);
3549+ }
3550+
3551+ return {w_bar, h_bar};
3552+ }
3553+
35153554private:
35163555 // draw src image into dst image at offset (offset_x, offset_y)
35173556 static void draw_into (clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
@@ -3982,7 +4021,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
39824021 //
39834022 // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
39844023 const clip_image_size refined_size = img_tool::calc_size_preserved_ratio (
3985- original_size, params.image_size , params.preproc_image_size );
4024+ original_size, params.image_size , params.image_longest_edge );
39864025 // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
39874026 // __func__, original_size.width, original_size.height,
39884027 // refined_size.width, refined_size.height);
@@ -4064,37 +4103,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40644103 case PROJECTOR_TYPE_LFM2:
40654104 case PROJECTOR_TYPE_KIMIVL:
40664105 {
4067- GGML_ASSERT (params.proj_scale_factor );
4068- // smart resize
4069- const int width = img->nx ;
4070- const int height = img->ny ;
4071- const int total_factor = params.patch_size * params.proj_scale_factor ;
4072- constexpr int min_image_tokens = 64 ;
4073- constexpr int max_image_tokens = 1024 ;
4074- const float min_pixels = min_image_tokens * total_factor * total_factor;
4075- const float max_pixels = max_image_tokens * total_factor * total_factor;
4076-
4077- auto round_by_factor = [f = total_factor](float x) { return static_cast <int >(std::nearbyintf (x / static_cast <float >(f))) * f; };
4078- auto ceil_by_factor = [f = total_factor](float x) { return static_cast <int >(std::ceil (x / static_cast <float >(f))) * f; };
4079- auto floor_by_factor = [f = total_factor](float x) { return static_cast <int >(std::floor (x / static_cast <float >(f))) * f; };
4080-
4081- int h_bar = std::max (total_factor, round_by_factor (height));
4082- int w_bar = std::max (total_factor, round_by_factor (width));
4083-
4084- if (h_bar * w_bar > max_pixels) {
4085- const auto beta = std::sqrt ((height * width) / max_pixels);
4086- h_bar = std::max (total_factor, floor_by_factor (height / beta));
4087- w_bar = std::max (total_factor, floor_by_factor (width / beta));
4088- } else if (h_bar * w_bar < min_pixels) {
4089- const auto beta = std::sqrt (min_pixels / (height * width));
4090- h_bar = ceil_by_factor (height * beta);
4091- w_bar = ceil_by_factor (width * beta);
4092- }
4093-
4106+ const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
4107+ original_size,
4108+ params.patch_size * params.proj_scale_factor ,
4109+ params.image_min_pixels ,
4110+ params.image_max_pixels );
40944111 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
40954112
40964113 clip_image_u8 resized_img;
4097- img_tool::resize (*img, resized_img, clip_image_size{w_bar, h_bar} , img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
4114+ img_tool::resize (*img, resized_img, target_size , img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
40984115 clip_image_f32_ptr res (clip_image_f32_init ());
40994116 normalize_image_u8_to_f32 (resized_img, *res, params.image_mean , params.image_std );
41004117 res_imgs->entries .push_back (std::move (res));
0 commit comments