Skip to content

Commit bae84d4

Browse files
committed
use simple resize for qwen
1 parent 4621d99 commit bae84d4

File tree

1 file changed

+20
-51
lines changed

1 file changed

+20
-51
lines changed

tools/mtmd/clip.cpp

Lines changed: 20 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -3426,24 +3426,18 @@ struct img_tool {
34263426
// RESIZE_ALGO_LANCZOS, // TODO
34273427
};
34283428

3429-
enum resize_pad {
3430-
RESIZE_PAD_NONE,
3431-
RESIZE_PAD_AROUND,
3432-
RESIZE_PAD_BOTTOM_RIGHT,
3433-
};
3434-
34353429
static void resize(
34363430
const clip_image_u8 & src,
34373431
clip_image_u8 & dst,
34383432
const clip_image_size & target_resolution,
34393433
resize_algo algo,
3440-
resize_pad pad_mode = RESIZE_PAD_AROUND,
3434+
bool add_padding = true, // TODO: define the behavior for add_padding = false
34413435
std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
34423436
dst.nx = target_resolution.width;
34433437
dst.ny = target_resolution.height;
34443438
dst.buf.resize(3 * dst.nx * dst.ny);
34453439

3446-
if (pad_mode == RESIZE_PAD_NONE) {
3440+
if (!add_padding) {
34473441
// direct resize
34483442
switch (algo) {
34493443
case RESIZE_ALGO_BILINEAR:
@@ -3478,15 +3472,8 @@ struct img_tool {
34783472
// fill dst with pad_color
34793473
fill(dst, pad_color);
34803474

3481-
int offset_x = 0;
3482-
int offset_y = 0;
3483-
if (pad_mode == RESIZE_PAD_AROUND) {
3484-
offset_x = (target_resolution.width - new_width) / 2;
3485-
offset_y = (target_resolution.height - new_height) / 2;
3486-
} else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) {
3487-
offset_x = target_resolution.width - new_width;
3488-
offset_y = target_resolution.height - new_height;
3489-
}
3475+
int offset_x = (target_resolution.width - new_width) / 2;
3476+
int offset_y = (target_resolution.height - new_height) / 2;
34903477

34913478
composite(dst, resized_image, offset_x, offset_y);
34923479
}
@@ -3523,8 +3510,9 @@ struct img_tool {
35233510
float target_width_f = static_cast<float>(inp_size.width) * scale;
35243511
float target_height_f = static_cast<float>(inp_size.height) * scale;
35253512

3526-
int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
3527-
int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
3513+
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
3514+
int aligned_width = ceil_by_factor(target_width_f);
3515+
int aligned_height = ceil_by_factor(target_height_f);
35283516

35293517
return {aligned_width, aligned_height};
35303518
}
@@ -3852,7 +3840,7 @@ struct llava_uhd {
38523840
} else {
38533841
// only algo bicubic preserves the ratio; old models rely on this behavior
38543842
// TODO: do we need to support other algos here?
3855-
img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE);
3843+
img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
38563844
}
38573845

38583846
// create slices
@@ -4022,35 +4010,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40224010
case PROJECTOR_TYPE_QWEN3VL:
40234011
{
40244012
// resize the image directly (no padding) to a size that aligns to the grid
4025-
clip_image_u8 canvas;
4026-
const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
4013+
clip_image_u8 resized;
4014+
const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
40274015
original_size,
4028-
params.patch_size * params.n_merge,
4016+
params.patch_size * 2,
40294017
params.image_min_pixels,
40304018
params.image_max_pixels);
4031-
canvas.nx = canvas_size.width;
4032-
canvas.ny = canvas_size.height;
4033-
canvas.buf.resize(3 * canvas.nx * canvas.ny);
4034-
img_tool::fill(canvas, {0, 0, 0});
4035-
4036-
// step 2: composite resized image onto the canvas, top-left corner
4037-
if (original_size.height > canvas.ny || original_size.width > canvas.nx) {
4038-
// need to resize original image first
4039-
clip_image_u8 resized;
4040-
const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
4041-
original_size,
4042-
1, // no need to align here since we will composite onto canvas
4043-
std::min(canvas.nx, canvas.ny)); // fit into the canvas
4044-
img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
4045-
img_tool::composite(canvas, resized, 0, 0);
4046-
} else {
4047-
// no resizing needed
4048-
img_tool::composite(canvas, *img, 0, 0);
4049-
}
4050-
4019+
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
4020+
// clip_image_save_to_bmp(resized, "preproc.bmp");
40514021
clip_image_f32_ptr img_f32(clip_image_f32_init());
40524022
// clip_image_f32_ptr res(clip_image_f32_init());
4053-
normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
4023+
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
40544024
// res_imgs->data[0] = *res;
40554025
res_imgs->entries.push_back(std::move(img_f32));
40564026
} break;
@@ -4163,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41634133
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
41644134

41654135
clip_image_u8 resized_img;
4166-
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
4136+
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
41674137
clip_image_f32_ptr res(clip_image_f32_init());
41684138
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
41694139
res_imgs->entries.push_back(std::move(res));
@@ -4195,7 +4165,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41954165
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
41964166

41974167
// resize the image to the target_size
4198-
img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
4168+
img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
41994169

42004170
clip_image_f32_ptr res(clip_image_f32_init());
42014171
normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
@@ -4268,15 +4238,15 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
42684238
const auto & params = ctx->model.hparams;
42694239
const int n_total = clip_n_output_tokens(ctx, img);
42704240
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4271-
return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
4241+
return img->nx / (params.patch_size * 2);
42724242
}
42734243
return n_total;
42744244
}
42754245

42764246
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
42774247
const auto & params = ctx->model.hparams;
42784248
if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
4279-
return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
4249+
return img->ny / (params.patch_size * 2);
42804250
}
42814251
return 1;
42824252
}
@@ -4334,9 +4304,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
43344304
case PROJECTOR_TYPE_QWEN3VL:
43354305
{
43364306
// dynamic size (2 conv, so double patch size)
4337-
int patch_size = params.patch_size * 2;
4338-
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
4339-
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
4307+
int x_patch = img->nx / (params.patch_size * 2);
4308+
int y_patch = img->ny / (params.patch_size * 2);
43404309
n_patches = x_patch * y_patch;
43414310
} break;
43424311
case PROJECTOR_TYPE_GEMMA3:

0 commit comments

Comments
 (0)