Skip to content

Commit 8819c96

Browse files
committed
feat: Add tiling support for idefics3 in clip.cpp
This tiling logic should likely be moved into llava_uhd::get_slice_instructions, but keeping it in clip_image_preprocess for now avoids disrupting the logic there.
Branch: gabe-l-hart/GraniteDocling
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 0aef5e9 commit 8819c96

File tree

1 file changed

+45
-3
lines changed

1 file changed

+45
-3
lines changed

tools/mtmd/clip.cpp

Lines changed: 45 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3551,10 +3551,52 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
35513551
// res_imgs->data[0] = *res;
35523552
res_imgs->entries.push_back(std::move(img_f32));
35533553
return true;
3554-
}
3555-
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
3554+
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
3555+
// Do an aspect-ratio preserving resize to the max size
3556+
// TODO: Integrate into llava_uhd to avoid copy-paste
3557+
const int32_t target_size = params.image_size * params.proj_scale_factor;
3558+
const float scale = std::min(
3559+
static_cast<float>(target_size) / original_size.width,
3560+
static_cast<float>(target_size) / original_size.height);
3561+
const clip_image_size refined_size{
3562+
static_cast<int>(original_size.width * scale),
3563+
static_cast<int>(original_size.height * scale),
3564+
};
3565+
llava_uhd::slice_instructions instructions;
3566+
instructions.overview_size = clip_image_size{params.image_size, params.image_size};
3567+
instructions.refined_size = refined_size;
3568+
instructions.grid_size = clip_image_size{
3569+
static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
3570+
static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
3571+
};
3572+
instructions.padding_refined = true;
3573+
for (int y = 0; y < refined_size.height; y += params.image_size) {
3574+
for (int x = 0; x < refined_size.width; x += params.image_size) {
3575+
instructions.slices.push_back(llava_uhd::slice_coordinates{
3576+
/* x */x,
3577+
/* y */y,
3578+
/* size */clip_image_size{
3579+
std::min(params.image_size, refined_size.width - x),
3580+
std::min(params.image_size, refined_size.height - y)
3581+
}
3582+
});
3583+
}
3584+
}
3585+
auto imgs = llava_uhd::slice_image(img, instructions);
3586+
3587+
// cast and normalize to f32
3588+
for (size_t i = 0; i < imgs.size(); ++i) {
3589+
// clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3590+
clip_image_f32_ptr res(clip_image_f32_init());
3591+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3592+
res_imgs->entries.push_back(std::move(res));
3593+
}
3594+
3595+
res_imgs->grid_x = instructions.grid_size.width;
3596+
res_imgs->grid_y = instructions.grid_size.height;
3597+
return true;
3598+
} else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
35563599
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
3557-
|| ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
35583600
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
35593601
) {
35603602
clip_image_u8 resized_image;

0 commit comments

Comments
 (0)