@@ -3551,10 +3551,52 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3551
3551
// res_imgs->data[0] = *res;
3552
3552
res_imgs->entries .push_back (std::move (img_f32));
3553
3553
return true ;
3554
- }
3555
- else if (ctx->proj_type () == PROJECTOR_TYPE_GLM_EDGE
3554
+ } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
3555
+ // Do an aspect-ratio preserving resize to the max size
3556
+ // TODO: Integrate into llava_uhd to avoid copy-paste
3557
+ const int32_t target_size = params.image_size * params.proj_scale_factor ;
3558
+ const float scale = std::min (
3559
+ static_cast <float >(target_size) / original_size.width ,
3560
+ static_cast <float >(target_size) / original_size.height );
3561
+ const clip_image_size refined_size{
3562
+ static_cast <int >(original_size.width * scale),
3563
+ static_cast <int >(original_size.height * scale),
3564
+ };
3565
+ llava_uhd::slice_instructions instructions;
3566
+ instructions.overview_size = clip_image_size{params.image_size , params.image_size };
3567
+ instructions.refined_size = refined_size;
3568
+ instructions.grid_size = clip_image_size{
3569
+ static_cast <int >(std::ceil (static_cast <float >(refined_size.width ) / params.image_size )),
3570
+ static_cast <int >(std::ceil (static_cast <float >(refined_size.height ) / params.image_size )),
3571
+ };
3572
+ instructions.padding_refined = true ;
3573
+ for (int y = 0 ; y < refined_size.height ; y += params.image_size ) {
3574
+ for (int x = 0 ; x < refined_size.width ; x += params.image_size ) {
3575
+ instructions.slices .push_back (llava_uhd::slice_coordinates{
3576
+ /* x */ x,
3577
+ /* y */ y,
3578
+ /* size */ clip_image_size{
3579
+ std::min (params.image_size , refined_size.width - x),
3580
+ std::min (params.image_size , refined_size.height - y)
3581
+ }
3582
+ });
3583
+ }
3584
+ }
3585
+ auto imgs = llava_uhd::slice_image (img, instructions);
3586
+
3587
+ // cast and normalize to f32
3588
+ for (size_t i = 0 ; i < imgs.size (); ++i) {
3589
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3590
+ clip_image_f32_ptr res (clip_image_f32_init ());
3591
+ normalize_image_u8_to_f32 (*imgs[i], *res, params.image_mean , params.image_std );
3592
+ res_imgs->entries .push_back (std::move (res));
3593
+ }
3594
+
3595
+ res_imgs->grid_x = instructions.grid_size .width ;
3596
+ res_imgs->grid_y = instructions.grid_size .height ;
3597
+ return true ;
3598
+ } else if (ctx->proj_type () == PROJECTOR_TYPE_GLM_EDGE
3556
3599
|| ctx->proj_type () == PROJECTOR_TYPE_GEMMA3
3557
- || ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3
3558
3600
|| ctx->proj_type () == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
3559
3601
) {
3560
3602
clip_image_u8 resized_image;
0 commit comments