@@ -206,7 +206,6 @@ struct clip_hparams {
206206 int minicpmv_version = 0 ;
207207 int32_t minicpmv_query_num = 0 ; // MiniCPM-V query number
208208
209- // used by LFM2 and KIMI-VL
210209 void set_limit_image_tokens (int n_tokens_min, int n_tokens_max) {
211210 const int patch_area = patch_size * patch_size * proj_scale_factor;
212211 image_min_pixels = n_tokens_min * patch_area;
@@ -2592,7 +2591,6 @@ struct clip_model_loader {
25922591
25932592 if (is_vision) {
25942593 get_u32 (KEY_IMAGE_SIZE, hparams.image_size );
2595- get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge , false );
25962594 get_u32 (KEY_PATCH_SIZE, hparams.patch_size );
25972595 get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
25982596 get_i32 (KEY_MINICPMV_VERSION, hparams.minicpmv_version , false ); // legacy
@@ -2707,18 +2705,20 @@ struct clip_model_loader {
27072705 } break ;
27082706 case PROJECTOR_TYPE_IDEFICS3:
27092707 {
2710- hparams.set_limit_image_tokens (8 , 1024 );
2711- hparams.set_warmup_n_tokens (256 ); // avoid OOM on warmup
27122708 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2709+ get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge , false );
27132710 } break ;
27142711 case PROJECTOR_TYPE_LFM2:
27152712 {
2716- hparams.set_limit_image_tokens (8 , 256 );
2713+ // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
2714+ hparams.set_limit_image_tokens (64 , 256 );
27172715 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
27182716 } break ;
27192717 case PROJECTOR_TYPE_PIXTRAL:
27202718 case PROJECTOR_TYPE_LIGHTONOCR:
27212719 {
2720+ // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
2721+ // TODO: verify the image_min_tokens
27222722 hparams.rope_theta = 10000 .0f ;
27232723 int spatial_merge = 2 ;
27242724 get_u32 (KEY_SPATIAL_MERGE_SIZE, spatial_merge, false );
@@ -2730,6 +2730,7 @@ struct clip_model_loader {
27302730 {
27312731 hparams.rope_theta = 10000 .0f ;
27322732 get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
2733+ // TODO: check kimivl preprocessor for exact values
27332734 hparams.set_limit_image_tokens (8 , 1024 );
27342735 hparams.set_warmup_n_tokens (256 ); // avoid OOM on warmup
27352736 } break ;
@@ -2749,7 +2750,11 @@ struct clip_model_loader {
27492750 get_u32 (KEY_SPATIAL_MERGE_SIZE, spatial_merge, false );
27502751 hparams.proj_scale_factor = spatial_merge * spatial_merge;
27512752 get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern , model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
2752- hparams.set_limit_image_tokens (8 , 1024 );
2753+ // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2754+ // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
2755+ // but we set a lower value to avoid OOM
2756+ // TODO: make it configurable by user
2757+ hparams.set_limit_image_tokens (1 , 2048 );
27532758 hparams.set_warmup_n_tokens (256 ); // avoid OOM on warmup
27542759 } break ;
27552760 case PROJECTOR_TYPE_LLAMA4:
@@ -2791,7 +2796,13 @@ struct clip_model_loader {
27912796 LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
27922797 LOG_INF (" %s: n_wa_pattern: %d\n " , __func__, hparams.n_wa_pattern );
27932798 if (hparams.proj_scale_factor > 0 ) {
2794- LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
2799+ LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
2800+ }
2801+ if (hparams.image_min_pixels > 0 ) {
2802+ LOG_INF (" %s: image_min_pixels: %d\n " , __func__, hparams.image_min_pixels );
2803+ }
2804+ if (hparams.image_max_pixels > 0 ) {
2805+ LOG_INF (" %s: image_max_pixels: %d\n " , __func__, hparams.image_max_pixels );
27952806 }
27962807 } else if (is_audio) {
27972808 LOG_INF (" \n --- audio hparams ---\n " );
@@ -3467,11 +3478,7 @@ struct img_tool {
34673478 }
34683479
34693480 // fill dst with pad_color
3470- for (size_t i = 0 ; i < dst.buf .size (); i += 3 ) {
3471- dst.buf [i] = pad_color[0 ];
3472- dst.buf [i + 1 ] = pad_color[1 ];
3473- dst.buf [i + 2 ] = pad_color[2 ];
3474- }
3481+ fill (dst, pad_color);
34753482
34763483 int offset_x = 0 ;
34773484 int offset_y = 0 ;
@@ -3483,7 +3490,7 @@ struct img_tool {
34833490 offset_y = target_resolution.height - new_height;
34843491 }
34853492
3486- draw_into (dst, resized_image, offset_x, offset_y);
3493+ composite (dst, resized_image, offset_x, offset_y);
34873494 }
34883495 }
34893496
@@ -3507,7 +3514,8 @@ struct img_tool {
35073514 // the calculated size will be aligned to the nearest multiple of align_size
35083515 // if H or W size is larger than longest_edge, it will be resized to longest_edge
35093516 static clip_image_size calc_size_preserved_ratio (const clip_image_size & inp_size, const int align_size, const int longest_edge) {
3510- if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0 ) {
3517+ GGML_ASSERT (align_size > 0 );
3518+ if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0 ) {
35113519 return {0 , 0 };
35123520 }
35133521
@@ -3527,6 +3535,7 @@ struct img_tool {
35273535 // the calculated size will have min_pixels <= W*H <= max_pixels
35283536 // this is referred to as "smart_resize" in transformers code
35293537 static clip_image_size calc_size_preserved_ratio (const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3538+ GGML_ASSERT (align_size > 0 );
35303539 const int width = inp_size.width ;
35313540 const int height = inp_size.height ;
35323541
@@ -3550,9 +3559,8 @@ struct img_tool {
35503559 return {w_bar, h_bar};
35513560 }
35523561
3553- private:
35543562 // draw src image into dst image at offset (offset_x, offset_y)
3555- static void draw_into (clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
3563+ static void composite (clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
35563564 for (int y = 0 ; y < src.ny ; ++y) {
35573565 for (int x = 0 ; x < src.nx ; ++x) {
35583566 for (int c = 0 ; c < 3 ; ++c) {
@@ -3563,6 +3571,16 @@ struct img_tool {
35633571 }
35643572 }
35653573
3574+ // fill the image with a solid color
3575+ static void fill (clip_image_u8 & img, const std::array<uint8_t , 3 > & color) {
3576+ for (size_t i = 0 ; i < img.buf .size (); i += 3 ) {
3577+ img.buf [i] = color[0 ];
3578+ img.buf [i + 1 ] = color[1 ];
3579+ img.buf [i + 2 ] = color[2 ];
3580+ }
3581+ }
3582+
3583+ private:
35663584 // Bilinear resize function
35673585 static void resize_bilinear (const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
35683586 dst.nx = target_width;
@@ -3998,14 +4016,40 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
39984016 case PROJECTOR_TYPE_QWEN25VL:
39994017 case PROJECTOR_TYPE_QWEN3VL:
40004018 {
4001- clip_image_u8 resized;
4002- auto patch_size = params.patch_size * 2 ;
4003- auto new_size = img_tool::calc_size_preserved_ratio (original_size, patch_size, params.image_size );
4004- img_tool::resize (*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR);
4019+ // step 1: make a blank canvas which aligns with grid
4020+ clip_image_u8 canvas;
4021+ const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio (
4022+ original_size,
4023+ params.patch_size * static_cast <int >(std::sqrt (params.proj_scale_factor )),
4024+ params.image_min_pixels ,
4025+ params.image_max_pixels );
4026+ canvas.nx = canvas_size.width ;
4027+ canvas.ny = canvas_size.height ;
4028+ canvas.buf .resize (3 * canvas.nx * canvas.ny );
4029+ img_tool::fill (canvas, {0 , 0 , 0 });
4030+
4031+ // step 2: resize original image to fit into the canvas
4032+ const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio (
4033+ original_size,
4034+ 1 , // avoid distorting which causes bbox misalignment
4035+ params.image_min_pixels ,
4036+ params.image_max_pixels );
4037+
4038+ if (scaled_size.height != original_size.height ||
4039+ scaled_size.width != original_size.width ) {
4040+ clip_image_u8 resized;
4041+ img_tool::resize (*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
4042+ // step 3: composite resized image onto the canvas, top-left corner
4043+ img_tool::composite (canvas, resized, 0 , 0 );
4044+ } else {
4045+ // no resizing needed
4046+ // step 3: composite original image onto the canvas, top-left corner
4047+ img_tool::composite (canvas, *img, 0 , 0 );
4048+ }
40054049
40064050 clip_image_f32_ptr img_f32 (clip_image_f32_init ());
40074051 // clip_image_f32_ptr res(clip_image_f32_init());
4008- normalize_image_u8_to_f32 (resized , *img_f32, params.image_mean , params.image_std );
4052+ normalize_image_u8_to_f32 (canvas , *img_f32, params.image_mean , params.image_std );
40094053 // res_imgs->data[0] = *res;
40104054 res_imgs->entries .push_back (std::move (img_f32));
40114055 } break ;
@@ -4076,8 +4120,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40764120 case PROJECTOR_TYPE_LIGHTONOCR:
40774121 {
40784122 clip_image_u8 resized_image;
4079- auto new_size = img_tool::calc_size_preserved_ratio (original_size, params.patch_size , params.image_size );
4080- img_tool::resize (*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR);
4123+ const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
4124+ original_size,
4125+ params.patch_size * static_cast <int >(std::sqrt (params.proj_scale_factor )),
4126+ params.image_min_pixels ,
4127+ params.image_max_pixels );
4128+ img_tool::resize (*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
40814129 clip_image_f32_ptr img_f32 (clip_image_f32_init ());
40824130 normalize_image_u8_to_f32 (resized_image, *img_f32, params.image_mean , params.image_std );
40834131 res_imgs->entries .push_back (std::move (img_f32));
@@ -4104,7 +4152,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41044152 {
41054153 const clip_image_size target_size = img_tool::calc_size_preserved_ratio (
41064154 original_size,
4107- params.patch_size * params.proj_scale_factor ,
4155+ params.patch_size * static_cast < int >( std::sqrt ( params.proj_scale_factor )) ,
41084156 params.image_min_pixels ,
41094157 params.image_max_pixels );
41104158 const std::array<uint8_t , 3 > pad_color = {122 , 116 , 104 };
0 commit comments