@@ -3552,24 +3552,30 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
35523552 res_imgs->entries .push_back (std::move (img_f32));
35533553 return true ;
35543554 } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
3555- // Do an aspect-ratio preserving resize to the max size
3556- // TODO: Integrate into llava_uhd to avoid copy-paste
3555+ // The refined size is computed in two steps:
3556+ // 1. Resize, preserving the aspect ratio, so that the longer side is
3557+ // image_size * scale_factor
3558+ // 2. Resize, without preserving the aspect ratio, so that both sides are
3559+ // multiples of image_size (always rounding up)
3560+ //
3561+ // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
35573562 const int32_t target_size = params.image_size * params.proj_scale_factor ;
35583563 const float scale = std::min (
35593564 static_cast <float >(target_size) / original_size.width ,
35603565 static_cast <float >(target_size) / original_size.height );
3561- const clip_image_size refined_size{
3562- static_cast <int >(original_size.width * scale),
3563- static_cast <int >(original_size.height * scale),
3564- };
3566+ int refined_w = static_cast <int >(original_size.width * scale);
3567+ int refined_h = static_cast <int >(original_size.height * scale);
3568+ refined_w = static_cast <int >(params.image_size * std::ceil (static_cast <float >(refined_w) / params.image_size ));
3569+ refined_h = static_cast <int >(params.image_size * std::ceil (static_cast <float >(refined_h) / params.image_size ));
3570+ const clip_image_size refined_size{refined_w, refined_h};
3571+
35653572 llava_uhd::slice_instructions instructions;
35663573 instructions.overview_size = clip_image_size{params.image_size , params.image_size };
35673574 instructions.refined_size = refined_size;
35683575 instructions.grid_size = clip_image_size{
35693576 static_cast <int >(std::ceil (static_cast <float >(refined_size.width ) / params.image_size )),
35703577 static_cast <int >(std::ceil (static_cast <float >(refined_size.height ) / params.image_size )),
35713578 };
3572- instructions.padding_refined = true ;
35733579 for (int y = 0 ; y < refined_size.height ; y += params.image_size ) {
35743580 for (int x = 0 ; x < refined_size.width ; x += params.image_size ) {
35753581 instructions.slices .push_back (llava_uhd::slice_coordinates{
0 commit comments