Skip to content

Commit 64cef62

Browse files
committed
feat: Fully working image preprocessing for idefics3 w/ resize and slicing
Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <[email protected]>
1 parent e172313 commit 64cef62

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

tools/mtmd/clip.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3552,24 +3552,30 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
35523552
res_imgs->entries.push_back(std::move(img_f32));
35533553
return true;
35543554
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
3555-
// Do an aspect-ratio preserving resize to the max size
3556-
// TODO: Integrate into llava_uhd to avoid copy-paste
3555+
// The refined size is computed in two steps:
3556+
// 1. Resize while preserving the aspect ratio such that the longer side is
3557+
// image_size * proj_scale_factor
3558+
// 2. Resize w/out preserving aspect ratio such that both sides are
3559+
// multiples of image_size (always rounding up)
3560+
//
3561+
// CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
35573562
const int32_t target_size = params.image_size * params.proj_scale_factor;
35583563
const float scale = std::min(
35593564
static_cast<float>(target_size) / original_size.width,
35603565
static_cast<float>(target_size) / original_size.height);
3561-
const clip_image_size refined_size{
3562-
static_cast<int>(original_size.width * scale),
3563-
static_cast<int>(original_size.height * scale),
3564-
};
3566+
int refined_w = static_cast<int>(original_size.width * scale);
3567+
int refined_h = static_cast<int>(original_size.height * scale);
3568+
refined_w = static_cast<int>(params.image_size * std::ceil(static_cast<float>(refined_w) / params.image_size));
3569+
refined_h = static_cast<int>(params.image_size * std::ceil(static_cast<float>(refined_h) / params.image_size));
3570+
const clip_image_size refined_size{refined_w, refined_h};
3571+
35653572
llava_uhd::slice_instructions instructions;
35663573
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
35673574
instructions.refined_size = refined_size;
35683575
instructions.grid_size = clip_image_size{
35693576
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
35703577
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
35713578
};
3572-
instructions.padding_refined = true;
35733579
for (int y = 0; y < refined_size.height; y += params.image_size) {
35743580
for (int x = 0; x < refined_size.width; x += params.image_size) {
35753581
instructions.slices.push_back(llava_uhd::slice_coordinates{

0 commit comments

Comments
 (0)