Skip to content

Commit 68b1507

Browse files
committed
better image preproc for qwen
1 parent c53c566 commit 68b1507

File tree

1 file changed

+72
-24
lines changed

1 file changed

+72
-24
lines changed

tools/mtmd/clip.cpp

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,6 @@ struct clip_hparams {
206206
int minicpmv_version = 0;
207207
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
208208

209-
// used by LFM2 and KIMI-VL
210209
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
211210
const int patch_area = patch_size * patch_size * proj_scale_factor;
212211
image_min_pixels = n_tokens_min * patch_area;
@@ -2592,7 +2591,6 @@ struct clip_model_loader {
25922591

25932592
if (is_vision) {
25942593
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
2595-
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
25962594
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
25972595
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
25982596
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
@@ -2707,18 +2705,20 @@ struct clip_model_loader {
27072705
} break;
27082706
case PROJECTOR_TYPE_IDEFICS3:
27092707
{
2710-
hparams.set_limit_image_tokens(8, 1024);
2711-
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27122708
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2709+
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
27132710
} break;
27142711
case PROJECTOR_TYPE_LFM2:
27152712
{
2716-
hparams.set_limit_image_tokens(8, 256);
2713+
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
2714+
hparams.set_limit_image_tokens(64, 256);
27172715
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
27182716
} break;
27192717
case PROJECTOR_TYPE_PIXTRAL:
27202718
case PROJECTOR_TYPE_LIGHTONOCR:
27212719
{
2720+
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
2721+
// TODO: verify image_min_tokens
27222722
hparams.rope_theta = 10000.0f;
27232723
int spatial_merge = 2;
27242724
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
@@ -2730,6 +2730,7 @@ struct clip_model_loader {
27302730
{
27312731
hparams.rope_theta = 10000.0f;
27322732
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
2733+
// TODO: check kimivl preprocessor for exact values
27332734
hparams.set_limit_image_tokens(8, 1024);
27342735
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27352736
} break;
@@ -2749,7 +2750,11 @@ struct clip_model_loader {
27492750
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
27502751
hparams.proj_scale_factor = spatial_merge * spatial_merge;
27512752
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
2752-
hparams.set_limit_image_tokens(8, 1024);
2753+
// ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
2754+
// the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
2755+
// but we set a lower value to avoid OOM
2756+
// TODO: make it configurable by user
2757+
hparams.set_limit_image_tokens(1, 2048);
27532758
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
27542759
} break;
27552760
case PROJECTOR_TYPE_LLAMA4:
@@ -2791,7 +2796,13 @@ struct clip_model_loader {
27912796
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
27922797
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
27932798
if (hparams.proj_scale_factor > 0) {
2794-
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
2799+
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
2800+
}
2801+
if (hparams.image_min_pixels > 0) {
2802+
LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels);
2803+
}
2804+
if (hparams.image_max_pixels > 0) {
2805+
LOG_INF("%s: image_max_pixels: %d\n", __func__, hparams.image_max_pixels);
27952806
}
27962807
} else if (is_audio) {
27972808
LOG_INF("\n--- audio hparams ---\n");
@@ -3467,11 +3478,7 @@ struct img_tool {
34673478
}
34683479

34693480
// fill dst with pad_color
3470-
for (size_t i = 0; i < dst.buf.size(); i += 3) {
3471-
dst.buf[i] = pad_color[0];
3472-
dst.buf[i + 1] = pad_color[1];
3473-
dst.buf[i + 2] = pad_color[2];
3474-
}
3481+
fill(dst, pad_color);
34753482

34763483
int offset_x = 0;
34773484
int offset_y = 0;
@@ -3483,7 +3490,7 @@ struct img_tool {
34833490
offset_y = target_resolution.height - new_height;
34843491
}
34853492

3486-
draw_into(dst, resized_image, offset_x, offset_y);
3493+
composite(dst, resized_image, offset_x, offset_y);
34873494
}
34883495
}
34893496

@@ -3507,7 +3514,8 @@ struct img_tool {
35073514
// the calculated size will be aligned to the nearest multiple of align_size
35083515
// if H or W size is larger than longest_edge, it will be resized to longest_edge
35093516
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
3510-
if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) {
3517+
GGML_ASSERT(align_size > 0);
3518+
if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
35113519
return {0, 0};
35123520
}
35133521

@@ -3527,6 +3535,7 @@ struct img_tool {
35273535
// the calculated size will have min_pixels <= W*H <= max_pixels
35283536
// this is referred as "smart_resize" in transformers code
35293537
static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
3538+
GGML_ASSERT(align_size > 0);
35303539
const int width = inp_size.width;
35313540
const int height = inp_size.height;
35323541

@@ -3550,9 +3559,8 @@ struct img_tool {
35503559
return {w_bar, h_bar};
35513560
}
35523561

3553-
private:
35543562
// draw src image into dst image at offset (offset_x, offset_y)
3555-
static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
3563+
static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
35563564
for (int y = 0; y < src.ny; ++y) {
35573565
for (int x = 0; x < src.nx; ++x) {
35583566
for (int c = 0; c < 3; ++c) {
@@ -3563,6 +3571,16 @@ struct img_tool {
35633571
}
35643572
}
35653573

3574+
// fill the image with a solid color
3575+
static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
3576+
for (size_t i = 0; i < img.buf.size(); i += 3) {
3577+
img.buf[i] = color[0];
3578+
img.buf[i + 1] = color[1];
3579+
img.buf[i + 2] = color[2];
3580+
}
3581+
}
3582+
3583+
private:
35663584
// Bilinear resize function
35673585
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
35683586
dst.nx = target_width;
@@ -3998,14 +4016,40 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
39984016
case PROJECTOR_TYPE_QWEN25VL:
39994017
case PROJECTOR_TYPE_QWEN3VL:
40004018
{
4001-
clip_image_u8 resized;
4002-
auto patch_size = params.patch_size * 2;
4003-
auto new_size = img_tool::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
4004-
img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR);
4019+
// step 1: make a blank canvas which aligns with grid
4020+
clip_image_u8 canvas;
4021+
const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
4022+
original_size,
4023+
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
4024+
params.image_min_pixels,
4025+
params.image_max_pixels);
4026+
canvas.nx = canvas_size.width;
4027+
canvas.ny = canvas_size.height;
4028+
canvas.buf.resize(3 * canvas.nx * canvas.ny);
4029+
img_tool::fill(canvas, {0, 0, 0});
4030+
4031+
// step 2: resize original image to fit into the canvas
4032+
const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
4033+
original_size,
4034+
1, // avoid distortion, which causes bbox misalignment
4035+
params.image_min_pixels,
4036+
params.image_max_pixels);
4037+
4038+
if (scaled_size.height != original_size.height ||
4039+
scaled_size.width != original_size.width) {
4040+
clip_image_u8 resized;
4041+
img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
4042+
// step 3: composite resized image onto the canvas, top-left corner
4043+
img_tool::composite(canvas, resized, 0, 0);
4044+
} else {
4045+
// no resizing needed
4046+
// step 3: composite original image onto the canvas, top-left corner
4047+
img_tool::composite(canvas, *img, 0, 0);
4048+
}
40054049

40064050
clip_image_f32_ptr img_f32(clip_image_f32_init());
40074051
// clip_image_f32_ptr res(clip_image_f32_init());
4008-
normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
4052+
normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
40094053
// res_imgs->data[0] = *res;
40104054
res_imgs->entries.push_back(std::move(img_f32));
40114055
} break;
@@ -4076,8 +4120,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
40764120
case PROJECTOR_TYPE_LIGHTONOCR:
40774121
{
40784122
clip_image_u8 resized_image;
4079-
auto new_size = img_tool::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
4080-
img_tool::resize(*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR);
4123+
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
4124+
original_size,
4125+
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
4126+
params.image_min_pixels,
4127+
params.image_max_pixels);
4128+
img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
40814129
clip_image_f32_ptr img_f32(clip_image_f32_init());
40824130
normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
40834131
res_imgs->entries.push_back(std::move(img_f32));
@@ -4104,7 +4152,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41044152
{
41054153
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
41064154
original_size,
4107-
params.patch_size * params.proj_scale_factor,
4155+
params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
41084156
params.image_min_pixels,
41094157
params.image_max_pixels);
41104158
const std::array<uint8_t, 3> pad_color = {122, 116, 104};

0 commit comments

Comments
 (0)