@@ -3552,24 +3552,30 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3552
3552
res_imgs->entries .push_back (std::move (img_f32));
3553
3553
return true ;
3554
3554
} else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
3555
- // Do an aspect-ratio preserving resize to the max size
3556
- // TODO: Integrate into llava_uhd to avoid copy-paste
3555
+ // The refined size has two steps:
3556
+ // 1. Resize w/ aspect-ratio preserving such that the longer side is
3557
+ // image_size * scale_factor
3558
+ // 2. Resize w/out preserving aspect ratio such that both sides are
3559
+ // multiples of image_size (always rounding up)
3560
+ //
3561
+ // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3557
3562
const int32_t target_size = params.image_size * params.proj_scale_factor ;
3558
3563
const float scale = std::min (
3559
3564
static_cast <float >(target_size) / original_size.width ,
3560
3565
static_cast <float >(target_size) / original_size.height );
3561
- const clip_image_size refined_size{
3562
- static_cast <int >(original_size.width * scale),
3563
- static_cast <int >(original_size.height * scale),
3564
- };
3566
+ int refined_w = static_cast <int >(original_size.width * scale);
3567
+ int refined_h = static_cast <int >(original_size.height * scale);
3568
+ refined_w = static_cast <int >(params.image_size * std::ceil (static_cast <float >(refined_w) / params.image_size ));
3569
+ refined_h = static_cast <int >(params.image_size * std::ceil (static_cast <float >(refined_h) / params.image_size ));
3570
+ const clip_image_size refined_size{refined_w, refined_h};
3571
+
3565
3572
llava_uhd::slice_instructions instructions;
3566
3573
instructions.overview_size = clip_image_size{params.image_size , params.image_size };
3567
3574
instructions.refined_size = refined_size;
3568
3575
instructions.grid_size = clip_image_size{
3569
3576
static_cast <int >(std::ceil (static_cast <float >(refined_size.width ) / params.image_size )),
3570
3577
static_cast <int >(std::ceil (static_cast <float >(refined_size.height ) / params.image_size )),
3571
3578
};
3572
- instructions.padding_refined = true ;
3573
3579
for (int y = 0 ; y < refined_size.height ; y += params.image_size ) {
3574
3580
for (int x = 0 ; x < refined_size.width ; x += params.image_size ) {
3575
3581
instructions.slices .push_back (llava_uhd::slice_coordinates{
0 commit comments