Skip to content

Commit f5a7f4d

Browse files
committed
fix: Use the longest side instead of size * scale_factor
For Granite Docling, these come out to the same value, but that was just a coincidence. Branch: GraniteDocling Signed-off-by: Gabe Goodhart <[email protected]>
1 parent e1ba793 commit f5a7f4d

File tree

2 files changed

+8
-5
lines changed

2 files changed

+8
-5
lines changed

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232
// vision-specific
3333
#define KEY_IMAGE_SIZE "clip.vision.image_size"
34+
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
3435
#define KEY_PATCH_SIZE "clip.vision.patch_size"
3536
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
3637
#define KEY_IMAGE_STD "clip.vision.image_std"

tools/mtmd/clip.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,9 @@ struct clip_hparams {
170170
int32_t projection_dim;
171171
int32_t n_head;
172172
int32_t n_layer;
173-
int32_t proj_scale_factor = 0; // idefics3
173+
// idefics3
174+
int32_t preproc_image_size = 0;
175+
int32_t proj_scale_factor = 0;
174176

175177
float image_mean[3];
176178
float image_std[3];
@@ -2250,6 +2252,7 @@ struct clip_model_loader {
22502252

22512253
if (is_vision) {
22522254
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2255+
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
22532256
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
22542257
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
22552258
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -3554,15 +3557,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
35543557
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
35553558
// The refined size has two steps:
35563559
// 1. Resize w/ aspect-ratio preserving such that the longer side is
3557-
// image_size * scale_factor
3560+
// the preprocessor longest size
35583561
// 2. Resize w/out preserving aspect ratio such that both sides are
35593562
// multiples of image_size (always rounding up)
35603563
//
35613564
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3562-
const int32_t target_size = params.image_size * params.proj_scale_factor;
35633565
const float scale = std::min(
3564-
static_cast<float>(target_size) / original_size.width,
3565-
static_cast<float>(target_size) / original_size.height);
3566+
static_cast<float>(params.preproc_image_size) / original_size.width,
3567+
static_cast<float>(params.preproc_image_size) / original_size.height);
35663568
int refined_w = static_cast<int>(original_size.width * scale);
35673569
int refined_h = static_cast<int>(original_size.height * scale);
35683570
refined_w = static_cast<int>(params.image_size * std::ceil(static_cast<float>(refined_w) / params.image_size));

0 commit comments

Comments
 (0)