@@ -3551,10 +3551,52 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
35513551 // res_imgs->data[0] = *res;
35523552 res_imgs->entries .push_back (std::move (img_f32));
35533553 return true ;
3554- }
3555- else if (ctx->proj_type () == PROJECTOR_TYPE_GLM_EDGE
3554+ } else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
3555+ // Do an aspect-ratio preserving resize to the max size
3556+ // TODO: Integrate into llava_uhd to avoid copy-paste
3557+ const int32_t target_size = params.image_size * params.proj_scale_factor ;
3558+ const float scale = std::min (
3559+ static_cast <float >(target_size) / original_size.width ,
3560+ static_cast <float >(target_size) / original_size.height );
3561+ const clip_image_size refined_size{
3562+ static_cast <int >(original_size.width * scale),
3563+ static_cast <int >(original_size.height * scale),
3564+ };
3565+ llava_uhd::slice_instructions instructions;
3566+ instructions.overview_size = clip_image_size{params.image_size , params.image_size };
3567+ instructions.refined_size = refined_size;
3568+ instructions.grid_size = clip_image_size{
3569+ static_cast <int >(std::ceil (static_cast <float >(refined_size.width ) / params.image_size )),
3570+ static_cast <int >(std::ceil (static_cast <float >(refined_size.height ) / params.image_size )),
3571+ };
3572+ instructions.padding_refined = true ;
3573+ for (int y = 0 ; y < refined_size.height ; y += params.image_size ) {
3574+ for (int x = 0 ; x < refined_size.width ; x += params.image_size ) {
3575+ instructions.slices .push_back (llava_uhd::slice_coordinates{
3576+ /* x */ x,
3577+ /* y */ y,
3578+ /* size */ clip_image_size{
3579+ std::min (params.image_size , refined_size.width - x),
3580+ std::min (params.image_size , refined_size.height - y)
3581+ }
3582+ });
3583+ }
3584+ }
3585+ auto imgs = llava_uhd::slice_image (img, instructions);
3586+
3587+ // cast and normalize to f32
3588+ for (size_t i = 0 ; i < imgs.size (); ++i) {
3589+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3590+ clip_image_f32_ptr res (clip_image_f32_init ());
3591+ normalize_image_u8_to_f32 (*imgs[i], *res, params.image_mean , params.image_std );
3592+ res_imgs->entries .push_back (std::move (res));
3593+ }
3594+
3595+ res_imgs->grid_x = instructions.grid_size .width ;
3596+ res_imgs->grid_y = instructions.grid_size .height ;
3597+ return true ;
3598+ } else if (ctx->proj_type () == PROJECTOR_TYPE_GLM_EDGE
35563599 || ctx->proj_type () == PROJECTOR_TYPE_GEMMA3
3557- || ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3
35583600 || ctx->proj_type () == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
35593601 ) {
35603602 clip_image_u8 resized_image;
0 commit comments