@@ -170,7 +170,9 @@ struct clip_hparams {
170
170
int32_t projection_dim;
171
171
int32_t n_head;
172
172
int32_t n_layer;
173
- int32_t proj_scale_factor = 0 ; // idefics3
173
+ // idefics3
174
+ int32_t preproc_image_size = 0 ;
175
+ int32_t proj_scale_factor = 0 ;
174
176
175
177
float image_mean[3 ];
176
178
float image_std[3 ];
@@ -2250,6 +2252,7 @@ struct clip_model_loader {
2250
2252
2251
2253
if (is_vision) {
2252
2254
get_u32 (KEY_IMAGE_SIZE, hparams.image_size );
2255
+ get_u32 (KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size , false );
2253
2256
get_u32 (KEY_PATCH_SIZE, hparams.patch_size );
2254
2257
get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
2255
2258
get_i32 (KEY_MINICPMV_VERSION, hparams.minicpmv_version , false ); // legacy
@@ -3554,15 +3557,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3554
3557
} else if (ctx->proj_type () == PROJECTOR_TYPE_IDEFICS3) {
3555
3558
// The refined size has two steps:
3556
3559
// 1. Resize w/ aspect-ratio preserving such that the longer side is
3557
- // image_size * scale_factor
3560
+ // the preprocessor's longest-edge size (params.preproc_image_size)
3558
3561
// 2. Resize w/out preserving aspect ratio such that both sides are
3559
3562
// multiples of image_size (always rounding up)
3560
3563
//
3561
3564
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3562
- const int32_t target_size = params.image_size * params.proj_scale_factor ;
3563
3565
const float scale = std::min (
3564
- static_cast <float >(target_size ) / original_size.width ,
3565
- static_cast <float >(target_size ) / original_size.height );
3566
+ static_cast <float >(params. preproc_image_size ) / original_size.width ,
3567
+ static_cast <float >(params. preproc_image_size ) / original_size.height );
3566
3568
int refined_w = static_cast <int >(original_size.width * scale);
3567
3569
int refined_h = static_cast <int >(original_size.height * scale);
3568
3570
refined_w = static_cast <int >(params.image_size * std::ceil (static_cast <float >(refined_w) / params.image_size ));
0 commit comments