Skip to content

Commit 37f5a10

Browse files
authored
mtmd: enhance image resizing in llava_uhd (#18014)
1 parent 9e6649e commit 37f5a10

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

tools/mtmd/clip.cpp

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2230,7 +2230,14 @@ struct llava_uhd {
22302230
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
22312231
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
22322232
std::vector<slice_coordinates> slices;
2233+
2234+
img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
2235+
bool padding_overview = false; // if true, overview image will be padded when resized to overview_size
2236+
std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
2237+
2238+
img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
22332239
bool padding_refined = false; // if true, refined image will be padded to the grid size (e.g. llava-1.6)
2240+
std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
22342241
};
22352242

22362243
static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
@@ -2257,10 +2264,11 @@ struct llava_uhd {
22572264
auto refine_size = llava_uhd::select_best_resolution(
22582265
original_size,
22592266
ctx->model.hparams.image_res_candidates);
2260-
res.overview_size = clip_image_size{slice_size, slice_size};
2261-
res.refined_size = refine_size;
2262-
res.grid_size = clip_image_size{0, 0};
2263-
res.padding_refined = true;
2267+
res.overview_size = clip_image_size{slice_size, slice_size};
2268+
res.refined_size = refine_size;
2269+
res.grid_size = clip_image_size{0, 0};
2270+
res.padding_refined = true;
2271+
res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding
22642272

22652273
LOG_DBG("%s: using pinpoints for slicing\n", __func__);
22662274
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
@@ -2339,26 +2347,22 @@ struct llava_uhd {
23392347

23402348
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
23412349
std::vector<clip_image_u8_ptr> output;
2342-
img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable
23432350

23442351
// resize to overview size
23452352
clip_image_u8_ptr resized_img(clip_image_u8_init());
2346-
img_tool::resize(*img, *resized_img, inst.overview_size, interpolation);
2353+
img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
2354+
inst.padding_overview, inst.pad_color_overview);
23472355
output.push_back(std::move(resized_img));
2356+
23482357
if (inst.slices.empty()) {
23492358
// no slices, just return the resized image
23502359
return output;
23512360
}
23522361

23532362
// resize to refined size
23542363
clip_image_u8_ptr refined_img(clip_image_u8_init());
2355-
if (inst.padding_refined) {
2356-
img_tool::resize(*img, *refined_img, inst.refined_size, interpolation);
2357-
} else {
2358-
// only algo bicubic preserves the ratio; old models rely on this behavior
2359-
// TODO: do we need to support other algos here?
2360-
img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
2361-
}
2364+
img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
2365+
inst.padding_refined, inst.pad_color_refined);
23622366

23632367
// create slices
23642368
for (const auto & slice : inst.slices) {

0 commit comments

Comments
 (0)