@@ -171,7 +171,7 @@ struct clip_hparams {
171171 int32_t n_head;
172172 int32_t n_layer;
173173 // idefics3
174- int32_t preproc_image_size = 0 ;
174+ int32_t preproc_image_size = 0 ; // aka max_dimension
175175 int32_t proj_scale_factor = 0 ;
176176
177177 float image_mean[3 ];
@@ -3480,8 +3480,8 @@ struct image_manipulation {
34803480 return {0 , 0 };
34813481 }
34823482
3483- float scale = std::min (1 . 0f , std::min ( static_cast <float >(max_dimension) / inp_size.width ,
3484- static_cast <float >(max_dimension) / inp_size.height ) );
3483+ float scale = std::min (static_cast <float >(max_dimension) / inp_size.width ,
3484+ static_cast <float >(max_dimension) / inp_size.height );
34853485
34863486 float target_width_f = static_cast <float >(inp_size.width ) * scale;
34873487 float target_height_f = static_cast <float >(inp_size.height ) * scale;
@@ -3644,7 +3644,7 @@ struct llava_uhd {
36443644
36453645 // resize to overview size
36463646 clip_image_u8_ptr resized_img (clip_image_u8_init ());
3647- image_manipulation::bicubic_resize (*img, *resized_img, inst.overview_size . width , inst. overview_size . height );
3647+ image_manipulation::resize_and_pad_image (*img, *resized_img, inst.overview_size );
36483648 output.push_back (std::move (resized_img));
36493649 if (inst.slices .empty ()) {
36503650 // no slices, just return the resized image
@@ -3846,6 +3846,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
38463846 // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
38473847 const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio (
38483848 original_size, params.image_size , params.preproc_image_size );
3849+ // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
3850+ // __func__, original_size.width, original_size.height,
3851+ // refined_size.width, refined_size.height);
38493852
38503853 llava_uhd::slice_instructions instructions;
38513854 instructions.overview_size = clip_image_size{params.image_size , params.image_size };
@@ -3856,6 +3859,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
38563859 };
38573860 for (int y = 0 ; y < refined_size.height ; y += params.image_size ) {
38583861 for (int x = 0 ; x < refined_size.width ; x += params.image_size ) {
3862+ // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
38593863 instructions.slices .push_back (llava_uhd::slice_coordinates{
38603864 /* x */ x,
38613865 /* y */ y,
0 commit comments