From c1b189ae33ee19c5c90b0aefaa6ceff57471ec3e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 00:33:31 +0100 Subject: [PATCH 01/16] mtmd: refactor preprocessing + support max/min pixels --- tools/mtmd/clip-impl.h | 2 +- tools/mtmd/clip.cpp | 611 ++++++++++++++++++++++------------------- 2 files changed, 335 insertions(+), 278 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 311a4c9086a53..c7e9498349c1b 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -154,8 +154,8 @@ enum projector_type { PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_LIGHTONOCR, - PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b312fda637f3b..79ddd6126f325 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3399,9 +3399,134 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 // set of tools to manupulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv -struct image_manipulation { +struct img_tool { + enum resize_algo { + RESIZE_ALGO_BILINEAR, + RESIZE_ALGO_BICUBIC, + // RESIZE_ALGO_LANCZOS, // TODO + }; + + enum resize_pad { + RESIZE_PAD_NONE, + RESIZE_PAD_AROUND, + RESIZE_PAD_BOTTOM_RIGHT, + }; + + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + resize_pad pad_mode = RESIZE_PAD_AROUND, + std::array pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (pad_mode == RESIZE_PAD_NONE) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast(target_resolution.width) / src.nx; + float scale_h = static_cast(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + for (size_t i = 0; i < dst.buf.size(); i += 3) { + dst.buf[i] = pad_color[0]; + dst.buf[i + 1] = pad_color[1]; + dst.buf[i + 2] = pad_color[2]; + } + + int offset_x = 0; + int offset_y = 0; + if (pad_mode == RESIZE_PAD_AROUND) { + offset_x = (target_resolution.width - new_width) / 2; + offset_y = (target_resolution.height - new_height) / 2; + } else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) { + offset_x = target_resolution.width - new_width; + offset_y = target_resolution.height - new_height; + } + + draw_into(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + 
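// [editor's sketch, not part of the original patch] crop() assumes the
// rectangle [x, x+w) x [y, y+h) lies fully inside `image`; the slice
// coordinates built elsewhere in this patch are expected to satisfy that.
// A defensive variant could clamp the origin first, e.g.:
//   x = std::max(0, std::min(x, image.nx - w));
//   y = std::max(0, std::min(y, image.ny - h));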
dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 * (i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / inp_size.height); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); + int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + +private: + // draw src image into dst image at offset (offset_x, offset_y) + static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + for (int c = 0; c < 3; ++c) { + dst.buf[3 * ((y + offset_y) * dst.nx + (x + offset_x)) + c] = + src.buf[3 * (y * src.nx + x) + c]; + } + } + } + } + // Bilinear resize function - static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -3437,7 +3562,7 @@ struct image_manipulation { // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -3500,93 +3625,6 @@ struct image_manipulation { return true; } - // llava-1.6 type of resize_and_pad - // if the ratio is not 1:1, padding with pad_color will be applied - // pad_color is single channel, default is 0 (black) - static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array pad_color = {0, 0, 0}) { - int target_width = target_resolution.width; - int target_height = target_resolution.height; - - float scale_w = static_cast(target_width) / image.nx; - float scale_h = static_cast(target_height) / image.ny; - - int new_width, new_height; - - if (scale_w < scale_h) { - new_width = target_width; - new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); - } else { - new_height = target_height; - new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); - } - - clip_image_u8 resized_image; - bicubic_resize(image, resized_image, new_width, new_height); - - clip_image_u8 padded_image; - padded_image.nx = target_width; - padded_image.ny = 
target_height; - padded_image.buf.resize(3 * target_width * target_height); - - // Fill the padded image with the fill color - for (size_t i = 0; i < padded_image.buf.size(); i += 3) { - padded_image.buf[i] = pad_color[0]; - padded_image.buf[i + 1] = pad_color[1]; - padded_image.buf[i + 2] = pad_color[2]; - } - - // Calculate padding offsets - int pad_x = (target_width - new_width) / 2; - int pad_y = (target_height - new_height) / 2; - - // Copy the resized image into the center of the padded buffer - for (int y = 0; y < new_height; ++y) { - for (int x = 0; x < new_width; ++x) { - for (int c = 0; c < 3; ++c) { - padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; - } - } - } - dst = std::move(padded_image); - } - - static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; - } - } - } - - // calculate the size of the **resized** image, while preserving the aspect ratio - // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than max_dimension, it will be resized to max_dimension - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { - return {0, 0}; - } - - float scale = std::min(static_cast(max_dimension) / inp_size.width, - static_cast(max_dimension) / inp_size.height); - - float target_width_f = static_cast(inp_size.width) * scale; - float target_height_f = static_cast(inp_size.height) * scale; - - int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); - int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); - - return {aligned_width, aligned_height}; - } - -private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } @@ -3735,10 +3773,11 @@ struct llava_uhd { static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { std::vector output; + img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); - image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size); + img_tool::resize(*img, *resized_img, inst.overview_size, interpolation); output.push_back(std::move(resized_img)); if (inst.slices.empty()) { // no slices, just return the resized image @@ -3748,9 +3787,11 @@ struct llava_uhd { // resize to refined size clip_image_u8_ptr refined_img(clip_image_u8_init()); if (inst.padding_refined) { - image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size); + img_tool::resize(*img, *refined_img, inst.refined_size, interpolation); } else { - image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height); + // only algo bicubic preserves the ratio; old models rely on this behavior + // TODO: do we need to support other algos here? 
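// [editor's note] RESIZE_PAD_NONE stretches directly to refined_size with
// no padding; this matches the old bilinear_resize() call removed above,
// except that the interpolation is now bicubic instead of bilinear.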
+ img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE); } // create slices @@ -3761,7 +3802,7 @@ struct llava_uhd { int h = slice.size.height; clip_image_u8_ptr img_slice(clip_image_u8_init()); - image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); output.push_back(std::move(img_slice)); } @@ -3896,208 +3937,224 @@ struct llava_uhd { // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { clip_image_size original_size{img->nx, img->ny}; - bool pad_to_square = true; auto & params = ctx->model.hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { - pad_to_square = false; - } - if (clip_is_minicpmv(ctx)) { - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_MINICPMV: + { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + clip_image_u8 resized; + auto patch_size = params.patch_size * 2; + auto new_size = img_tool::calc_size_preserved_ratio(original_size, patch_size, params.image_size); + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR); + + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + } break; - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - clip_image_u8 resized; - auto patch_size = params.patch_size * 2; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); - image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height); - - clip_image_f32_ptr img_f32(clip_image_f32_init()); - // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - // res_imgs->data[0] = *res; - res_imgs->entries.push_back(std::move(img_f32)); - return true; - } else if 
(ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // The refined size has two steps: - // 1. Resize w/ aspect-ratio preserving such that the longer side is - // the preprocessor longest size - // 2. Resize w/out preserving aspect ratio such that both sides are - // multiples of image_size (always rounding up) - // - // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio( - original_size, params.image_size, params.preproc_image_size); - // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", - // __func__, original_size.width, original_size.height, - // refined_size.width, refined_size.height); - - llava_uhd::slice_instructions instructions; - instructions.overview_size = clip_image_size{params.image_size, params.image_size}; - instructions.refined_size = refined_size; - instructions.grid_size = clip_image_size{ - static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), - static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), - }; - for (int y = 0; y < refined_size.height; y += params.image_size) { - for (int x = 0; x < refined_size.width; x += params.image_size) { - // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); - instructions.slices.push_back(llava_uhd::slice_coordinates{ - /* x */x, - /* y */y, - /* size */clip_image_size{ - std::min(params.image_size, refined_size.width - x), - std::min(params.image_size, refined_size.height - y) + case PROJECTOR_TYPE_IDEFICS3: + { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, params.image_size, params.preproc_image_size); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{params.image_size, params.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), + }; + for (int y = 0; y < refined_size.height; y += params.image_size) { + for (int x = 0; x < refined_size.width; x += params.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(params.image_size, refined_size.width - x), + std::min(params.image_size, refined_size.height - y) + } + }); } - }); - } - } - auto imgs = llava_uhd::slice_image(img, instructions); - - // cast and normalize to f32 - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - 
res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = instructions.grid_size.width; - res_imgs->grid_y = instructions.grid_size.height; - return true; - } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE - || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution - ) { - clip_image_u8 resized_image; - int sz = params.image_size; - image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL - || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR - ) { - clip_image_u8 resized_image; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); - image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) { - GGML_ASSERT(!params.image_res_candidates.empty()); - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; - - } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 - || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL - ) { - GGML_ASSERT(params.proj_scale_factor); - - // smart resize - const int width = img->nx; - const int height = img->ny; - const int total_factor = params.patch_size * params.proj_scale_factor; - constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 1024; - const float min_pixels = min_image_tokens * total_factor * total_factor; - const float max_pixels = max_image_tokens * total_factor * total_factor; - - auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; - - int h_bar = std::max(total_factor, round_by_factor(height)); - int w_bar = std::max(total_factor, round_by_factor(width)); - - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt((height * width) / max_pixels); - h_bar = std::max(total_factor, floor_by_factor(height / beta)); - w_bar = std::max(total_factor, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(min_pixels / (height * width)); - h_bar = ceil_by_factor(height * beta); - w_bar = ceil_by_factor(width * beta); - } + } + auto imgs = llava_uhd::slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) 
+ ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - const std::array pad_color = {122, 116, 104}; + res_imgs->grid_x = instructions.grid_size.width; + res_imgs->grid_y = instructions.grid_size.height; + } break; - clip_image_u8 resized_img; - image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; - } + case PROJECTOR_TYPE_GLM_EDGE: + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + clip_image_u8 resized_image; + auto new_size = img_tool::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + img_tool::resize(*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + case PROJECTOR_TYPE_LLAMA4: + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - if (pad_to_square) { - // for llava-1.5, we resize image to a square, and pad the shorter side with a background color - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - const int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - // background color in RGB from LLaVA (this is the mean rgb color * 255) - const std::array pad_color = {122, 116, 104}; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + { + GGML_ASSERT(params.proj_scale_factor); + // smart resize + const int width = img->nx; + const int height = img->ny; + const int total_factor = params.patch_size * params.proj_scale_factor; + constexpr int min_image_tokens = 64; + constexpr int max_image_tokens = 1024; + const float min_pixels = min_image_tokens * total_factor * total_factor; + 
const float max_pixels = max_image_tokens * total_factor * total_factor; + + auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + int h_bar = std::max(total_factor, round_by_factor(height)); + int w_bar = std::max(total_factor, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt((height * width) / max_pixels); + h_bar = std::max(total_factor, floor_by_factor(height / beta)); + w_bar = std::max(total_factor, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(min_pixels / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } - // resize the image to the target_size - image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); + const std::array pad_color = {122, 116, 104}; - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; + clip_image_u8 resized_img; + img_tool::resize(*img, resized_img, clip_image_size{w_bar, h_bar}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } break; - } else if (!params.image_res_candidates.empty()) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm? 
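// [editor's note] these projector types share the legacy LLaVA path below:
// pad-to-square for llava-1.5-style configs, or "anyres" slicing for
// llava-1.6-style configs.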
+ { + // TODO @ngxson : refactor the code below to avoid duplicated logic + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { // pad_to_square + // for llava-1.5, we resize image to a square, and pad the shorter side with a background color + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + const int longer_side = std::max(img->nx, img->ny); + temp->nx = longer_side; + temp->ny = longer_side; + temp->buf.resize(3 * longer_side * longer_side); + + // background color in RGB from LLaVA (this is the mean rgb color * 255) + const std::array pad_color = {122, 116, 104}; + + // resize the image to the target_size + img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + return true; + + } else if (!params.image_res_candidates.empty()) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + return true; + } + } break; - return true; - } else { - GGML_ABORT("Unknown image preprocessing type"); + default: + LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); + return false; } + return true; } ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { From 13cd2045ab2128af130363df4c5cf15321adc972 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 10:35:04 +0100 Subject: [PATCH 02/16] fix mlp type --- tools/mtmd/clip.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 79ddd6126f325..6e0081aee05c9 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4114,7 +4114,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { // pad_to_square + if 
(params.image_res_candidates.empty()) { // pad_to_square // for llava-1.5, we resize image to a square, and pad the shorter side with a background color // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 const int longer_side = std::max(img->nx, img->ny); @@ -4131,9 +4131,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); - return true; - } else if (!params.image_res_candidates.empty()) { + } else { // "spatial_unpad" with "anyres" processing for llava-1.6 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); std::vector imgs = llava_uhd::slice_image(img, inst); @@ -4144,8 +4143,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); } - - return true; } } break; From 66d5c43d444a5a1f3021f05feab1fd6afcdc0f32 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 11:55:28 +0100 Subject: [PATCH 03/16] implement mix/max pixels --- tools/mtmd/clip.cpp | 97 ++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 6e0081aee05c9..09e082541b7bb 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -171,7 +171,9 @@ struct clip_hparams { int32_t n_head; int32_t n_layer; // idefics3 - int32_t preproc_image_size = 0; // aka max_dimension + int32_t image_longest_edge = 0; + int32_t image_min_pixels = 0; + int32_t image_max_pixels = 0; int32_t proj_scale_factor = 0; float image_mean[3]; @@ -204,6 +206,13 @@ struct clip_hparams { bool has_llava_projector = false; int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // used by LFM2 and KIMI-VL + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int total_factor = patch_size * proj_scale_factor; + image_min_pixels = n_tokens_min * total_factor * total_factor; + image_max_pixels = n_tokens_max * total_factor * total_factor; + } }; struct clip_layer { @@ -2577,7 +2586,7 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy @@ -2686,10 +2695,14 @@ struct clip_model_loader { hparams.minicpmv_version = 2; // default to 2 if not set } } break; + case PROJECTOR_TYPE_INTERNVL: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: - case PROJECTOR_TYPE_INTERNVL: { + hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: @@ -2697,15 +2710,14 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; hparams.warmup_image_size = hparams.patch_size * 8; - // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM - // ref: https://github.com/ggml-org/llama.cpp/issues/14310 - hparams.image_size = 1024; 
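// [editor's note] the token-count limits added below replace the hard
// 1024x1024 image_size cap removed above, so the pixel budget now scales
// with patch geometry rather than a fixed edge length.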
+ hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; hparams.warmup_image_size = hparams.patch_size * 8; + hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_GEMMA3: @@ -3494,14 +3506,14 @@ struct img_tool { // calculate the size of the **resized** image, while preserving the aspect ratio // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than max_dimension, it will be resized to max_dimension - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) { return {0, 0}; } - float scale = std::min(static_cast(max_dimension) / inp_size.width, - static_cast(max_dimension) / inp_size.height); + float scale = std::min(static_cast(longest_edge) / inp_size.width, + static_cast(longest_edge) / inp_size.height); float target_width_f = static_cast(inp_size.width) * scale; float target_height_f = static_cast(inp_size.height) * scale; @@ -3512,6 +3524,33 @@ struct img_tool { return {aligned_width, aligned_height}; } + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + const int width = inp_size.width; + const int height = inp_size.height; + + auto round_by_factor = [f = align_size](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + private: // draw src image into dst image at offset (offset_x, offset_y) static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { @@ -3982,7 +4021,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( - original_size, 
params.image_size, params.preproc_image_size); + original_size, params.image_size, params.image_longest_edge); // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", // __func__, original_size.width, original_size.height, // refined_size.width, refined_size.height); @@ -4064,37 +4103,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: { - GGML_ASSERT(params.proj_scale_factor); - // smart resize - const int width = img->nx; - const int height = img->ny; - const int total_factor = params.patch_size * params.proj_scale_factor; - constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 1024; - const float min_pixels = min_image_tokens * total_factor * total_factor; - const float max_pixels = max_image_tokens * total_factor * total_factor; - - auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; - - int h_bar = std::max(total_factor, round_by_factor(height)); - int w_bar = std::max(total_factor, round_by_factor(width)); - - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt((height * width) / max_pixels); - h_bar = std::max(total_factor, floor_by_factor(height / beta)); - w_bar = std::max(total_factor, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(min_pixels / (height * width)); - h_bar = ceil_by_factor(height * beta); - w_bar = ceil_by_factor(width * beta); - } - + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.proj_scale_factor, + params.image_min_pixels, + params.image_max_pixels); const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, clip_image_size{w_bar, h_bar}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); From c53c566319b6394e7ba61f07e06001c9ae642901 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 12:40:22 +0100 Subject: [PATCH 04/16] improve hparams --- tools/mtmd/clip.cpp | 73 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 09e082541b7bb..91aab1f72a3ea 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -174,7 +174,7 @@ struct clip_hparams { int32_t image_longest_edge = 0; int32_t image_min_pixels = 0; int32_t image_max_pixels = 0; - int32_t proj_scale_factor = 0; + int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2 float image_mean[3]; float image_std[3]; @@ -196,7 +196,6 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; - int32_t spatial_merge_size = 0; // audio int32_t n_mel_bins = 0; // whisper preprocessor @@ -209,9 +208,16 @@ struct clip_hparams { // used by LFM2 and KIMI-VL void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const 
int total_factor = patch_size * proj_scale_factor; - image_min_pixels = n_tokens_min * total_factor * total_factor; - image_max_pixels = n_tokens_max * total_factor * total_factor; + const int patch_area = patch_size * patch_size * proj_scale_factor; + image_min_pixels = n_tokens_min * patch_area; + image_max_pixels = n_tokens_max * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + warmup_image_size = n_tok_per_side * patch_size * static_cast(std::sqrt(proj_scale_factor)); } }; @@ -593,7 +599,7 @@ struct clip_graph { } ggml_cgraph * build_pixtral() { - const int n_merge = hparams.spatial_merge_size; + const int n_merge = hparams.proj_scale_factor; // 2D input positions ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); @@ -619,7 +625,7 @@ struct clip_graph { // mistral small 3.1 patch merger // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.spatial_merge_size > 0); + GGML_ASSERT(hparams.proj_scale_factor > 0); cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); @@ -935,7 +941,7 @@ struct clip_graph { // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl + const int merge_factor = hparams.proj_scale_factor > 0 ? 
hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl // loop over layers for (int il = 0; il < n_layer; il++) { @@ -2700,25 +2706,32 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_IDEFICS3: + { + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; case PROJECTOR_TYPE_LFM2: { - hparams.set_limit_image_tokens(64, 1024); + hparams.set_limit_image_tokens(8, 256); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - hparams.set_limit_image_tokens(64, 1024); - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + int spatial_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); + hparams.proj_scale_factor = spatial_merge * spatial_merge; + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_GEMMA3: { @@ -2729,29 +2742,15 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_QWEN2VL: - { - // max image size = sqrt(max_pixels) = 3584 - // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - } break; case PROJECTOR_TYPE_QWEN25VL: - { - // max image size = sqrt(max_pixels) - // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); - } break; case PROJECTOR_TYPE_QWEN3VL: { - hparams.image_size = 1024; // still need this? 
- hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + int spatial_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); + hparams.proj_scale_factor = spatial_merge * spatial_merge; + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_LLAMA4: { @@ -2791,8 +2790,8 @@ struct clip_model_loader { LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); - if (hparams.spatial_merge_size > 0) { - LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size); + if (hparams.proj_scale_factor > 0) { + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); @@ -4310,7 +4309,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LIGHTONOCR: { // dynamic size - int n_merge = params.spatial_merge_size; + int n_merge = params.proj_scale_factor; int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); if (ctx->model.token_embd_img_break) { From 68b1507a6600c5868ab4af3c91f003307815adf0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 14:58:55 +0100 Subject: [PATCH 05/16] better image preproc for qwen --- tools/mtmd/clip.cpp | 96 +++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 91aab1f72a3ea..0f8338fd6b05a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -206,7 +206,6 @@ struct clip_hparams { int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number - // used by LFM2 and KIMI-VL void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { const int patch_area = patch_size * patch_size * proj_scale_factor; image_min_pixels = n_tokens_min * patch_area; @@ -2592,7 +2591,6 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy @@ -2707,18 +2705,20 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_IDEFICS3: { - hparams.set_limit_image_tokens(8, 1024); - hparams.set_warmup_n_tokens(256); // avoid OOM on warmup get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); } break; case PROJECTOR_TYPE_LFM2: { - hparams.set_limit_image_tokens(8, 256); + // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json + hparams.set_limit_image_tokens(64, 256); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { + // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json + // TODO: verify the image_min_tokens hparams.rope_theta = 10000.0f; int spatial_merge = 2; 
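// [editor's note] proj_scale_factor stores spatial_merge squared (set just
// below), e.g. spatial_merge = 2 -> proj_scale_factor = 4, i.e. one output
// token per 2x2 block of patches.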
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); @@ -2730,6 +2730,7 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + // TODO: check kimivl preprocessor for exact values hparams.set_limit_image_tokens(8, 1024); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; @@ -2749,7 +2750,11 @@ struct clip_model_loader { get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); hparams.proj_scale_factor = spatial_merge * spatial_merge; get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it - hparams.set_limit_image_tokens(8, 1024); + // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens + // but we set a lower value to avoid OOM + // TODO: make it configurable by user + hparams.set_limit_image_tokens(1, 2048); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_LLAMA4: @@ -2791,7 +2796,13 @@ struct clip_model_loader { LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); if (hparams.proj_scale_factor > 0) { - LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + } + if (hparams.image_min_pixels > 0) { + LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels); + } + if (hparams.image_max_pixels > 0) { + LOG_INF("%s: image_max_pixels: %d\n", __func__, hparams.image_max_pixels); } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); @@ -3467,11 +3478,7 @@ struct img_tool { } // fill dst with pad_color - for (size_t i = 0; i < dst.buf.size(); i += 3) { - dst.buf[i] = pad_color[0]; - dst.buf[i + 1] = pad_color[1]; - dst.buf[i + 2] = pad_color[2]; - } + fill(dst, pad_color); int offset_x = 0; int offset_y = 0; @@ -3483,7 +3490,7 @@ struct img_tool { offset_y = target_resolution.height - new_height; } - draw_into(dst, resized_image, offset_x, offset_y); + composite(dst, resized_image, offset_x, offset_y); } } @@ -3507,7 +3514,8 @@ struct img_tool { // the calculated size will be aligned to the nearest multiple of align_size // if H or W size is larger than longest_edge, it will be resized to longest_edge static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { return {0, 0}; } @@ -3527,6 +3535,7 @@ struct img_tool { // the calculated size will have min_pixels <= W*H <= max_pixels // this is referred as "smart_resize" in transformers code static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); const int width = inp_size.width; const int height = inp_size.height; @@ -3550,9 +3559,8 @@ struct img_tool { return {w_bar, h_bar}; } -private: // draw src image into dst image at offset (offset_x, offset_y) - static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { for 
(int y = 0; y < src.ny; ++y) {
             for (int x = 0; x < src.nx; ++x) {
                 for (int c = 0; c < 3; ++c) {
@@ -3563,6 +3571,16 @@ struct img_tool {
         }
     }
 
+    // fill the image with a solid color
+    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
+        for (size_t i = 0; i < img.buf.size(); i += 3) {
+            img.buf[i] = color[0];
+            img.buf[i + 1] = color[1];
+            img.buf[i + 2] = color[2];
+        }
+    }
+
+private:
     // Bilinear resize function
     static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
         dst.nx = target_width;
@@ -3998,14 +4016,40 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                clip_image_u8 resized;
-                auto patch_size = params.patch_size * 2;
-                auto new_size = img_tool::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR);
+                // step 1: make a blank canvas which aligns with grid
+                clip_image_u8 canvas;
+                const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                canvas.nx = canvas_size.width;
+                canvas.ny = canvas_size.height;
+                canvas.buf.resize(3 * canvas.nx * canvas.ny);
+                img_tool::fill(canvas, {0, 0, 0});
+
+                // step 2: resize original image to fit into the canvas
+                const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    1, // avoid distorting which causes bbox misalignment
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+
+                if (scaled_size.height != original_size.height ||
+                        scaled_size.width != original_size.width) {
+                    clip_image_u8 resized;
+                    img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
+                    // step 3: composite resized image onto the canvas, top-left corner
+                    img_tool::composite(canvas, resized, 0, 0);
+                } else {
+                    // no resizing needed
+                    // step 3: composite original image onto the canvas, top-left corner
+                    img_tool::composite(canvas, *img, 0, 0);
+                }
 
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+                normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
@@ -4076,8 +4120,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 clip_image_u8 resized_image;
-                auto new_size = img_tool::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
-                img_tool::resize(*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(img_f32));
@@ -4104,7 +4152,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             {
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.proj_scale_factor,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};

From 7bd1a011b4b9dc2901af5fe959c984e8b6a1b30a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:18:19 +0100
Subject: [PATCH 06/16] fix

---
 tools/mtmd/clip.cpp | 47 ++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 0f8338fd6b05a..b18f56fe57701 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -216,7 +216,11 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));
+        warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
+    }
+
+    int get_scale_factor_per_side() const {
+        return static_cast<int>(std::sqrt(proj_scale_factor));
     }
 };
@@ -546,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.proj_scale_factor;
+        const int kernel_size = hparams.get_scale_factor_per_side();
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -568,13 +572,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_scale_factor_per_side();
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_scale_factor_per_side();
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -598,7 +602,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.proj_scale_factor;
+        const int n_merge = hparams.get_scale_factor_per_side();
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -940,7 +944,8 @@ struct clip_graph {
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.proj_scale_factor > 0
+            ? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
@@ -2366,16 +2371,16 @@ struct clip_graph {
 
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
-    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
-        GGML_ASSERT(scale_factor > 1);
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int kernel_size) {
+        GGML_ASSERT(kernel_size > 1);
 
         const int n_embd = cur->ne[0];
         int width = img.nx / patch_size;
         int height = img.ny / patch_size;
 
         // pad width and height to factor
-        const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
-        const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
+        const int64_t pad_width = CLIP_ALIGN(width, kernel_size) - width;
+        const int64_t pad_height = CLIP_ALIGN(height, kernel_size) - height;
         cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
         if (pad_width || pad_height) {
             cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
@@ -2384,11 +2389,11 @@ struct clip_graph {
         }
 
         // unshuffle h
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
+        cur = ggml_reshape_3d(ctx0, cur, n_embd * kernel_size, width / kernel_size, height);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         // unshuffle w
-        cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
+        cur = ggml_cont_3d(ctx0, cur, n_embd * kernel_size * kernel_size, height / kernel_size, width / kernel_size);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
@@ -3203,9 +3208,11 @@ struct clip_model_loader {
             if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
                 img->nx = hparams.warmup_image_size;
                 img->ny = hparams.warmup_image_size;
+                LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
             } else {
                 img->nx = hparams.warmup_audio_size;
                 img->ny = hparams.n_mel_bins;
+                LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
             }
             batch.entries.push_back(std::move(img));
 
@@ -4020,7 +4027,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4119,10 +4126,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
+                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 clip_image_u8 resized_image;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(img_f32));
@@ -4150,9 +4158,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
+                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};

From 2892e0fb0c684bb91c944e1ea4cab416d1793162 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:30:29 +0100
Subject: [PATCH 07/16] fix out of bound composite

---
 tools/mtmd/clip.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index b18f56fe57701..4bfe68820f94e 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -572,13 +572,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.get_scale_factor_per_side();
+            const int scale_factor = model.hparams.proj_scale_factor;
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.get_scale_factor_per_side();
+            const int scale_factor = model.hparams.proj_scale_factor;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -3570,10 +3570,17 @@ struct img_tool {
     static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
         for (int y = 0; y < src.ny; ++y) {
             for (int x = 0; x < src.nx; ++x) {
-                for (int c = 0; c < 3; ++c) {
-                    dst.buf[3 * ((y + offset_y) * dst.nx + (x + offset_x)) + c] =
-                        src.buf[3 * (y * src.nx + x) + c];
+                int dx = x + offset_x;
+                int dy = y + offset_y;
+                // skip pixels that would be out of bounds in the destination
+                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+                    continue;
                 }
+                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
+                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
+                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
+                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
             }
         }
     }

From 42744178a270256da1f49e9c824a523520b44e60 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:35:54 +0100
Subject: [PATCH 08/16] fix (2)

---
 tools/mtmd/clip.cpp | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 4bfe68820f94e..2442064610501 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3546,12 +3546,12 @@ struct img_tool {
         const int width = inp_size.width;
         const int height = inp_size.height;
 
-        auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
         auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
         auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
 
-        int h_bar = std::max(align_size, round_by_factor(height));
-        int w_bar = std::max(align_size, round_by_factor(width));
+        // always align up first
+        int h_bar = std::max(align_size, ceil_by_factor(height));
+        int w_bar = std::max(align_size, ceil_by_factor(width));
 
         if (h_bar * w_bar > max_pixels) {
             const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
@@ -4030,7 +4030,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                // step 1: make a blank canvas which aligns with grid
+                // step 1: make a blank canvas which aligns to the grid
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
@@ -4042,22 +4042,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 canvas.buf.resize(3 * canvas.nx * canvas.ny);
                 img_tool::fill(canvas, {0, 0, 0});
 
-                // step 2: resize original image to fit into the canvas
-                const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    1, // avoid distorting which causes bbox misalignment
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-
-                if (scaled_size.height != original_size.height ||
-                        scaled_size.width != original_size.width) {
+                // step 2: composite resized image onto the canvas, top-left corner
+                if (original_size.height > canvas.ny || original_size.width > canvas.nx) {
+                    // need to resize original image first
                     clip_image_u8 resized;
+                    const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
+                        original_size,
+                        1, // no need to align here since we will composite onto canvas
+                        std::min(canvas.nx, canvas.ny)); // fit into the canvas
                     img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
-                    // step 3: composite resized image onto the canvas, top-left corner
                     img_tool::composite(canvas, resized, 0, 0);
                 } else {
                     // no resizing needed
-                    // step 3: composite original image onto the canvas, top-left corner
                     img_tool::composite(canvas, *img, 0, 0);
                 }
 

From 000d1d9d35f012b896e17eb6c9db941080c4cd4d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:42:28 +0100
Subject: [PATCH 09/16] fix token calculation

---
 tools/mtmd/clip.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2442064610501..f1df4d9d04b3a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2371,16 +2371,16 @@ struct clip_graph {
 
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
-    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int kernel_size) {
-        GGML_ASSERT(kernel_size > 1);
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+        GGML_ASSERT(scale_factor > 1);
 
         const int n_embd = cur->ne[0];
         int width = img.nx / patch_size;
         int height = img.ny / patch_size;
 
         // pad width and height to factor
-        const int64_t pad_width = CLIP_ALIGN(width, kernel_size) - width;
-        const int64_t pad_height = CLIP_ALIGN(height, kernel_size) - height;
+        const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
+        const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
         cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
         if (pad_width || pad_height) {
             cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
@@ -2384,11 +2389,11 @@ struct clip_graph {
         }
 
         // unshuffle h
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * kernel_size, width / kernel_size, height);
+        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         // unshuffle w
-        cur = ggml_cont_3d(ctx0, cur, n_embd * kernel_size * kernel_size, height / kernel_size, width / kernel_size);
+        cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
@@ -4351,7 +4351,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_LLAMA4:
             {
-                n_patches /= ctx->model.hparams.proj_scale_factor;
+                // both X and Y are downscaled by the scale factor
+                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                n_patches /= (scale_factor * scale_factor);
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:

From bfd03fb6db78accb91f24b4740fbc126040a53d4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:42:59 +0100
Subject: [PATCH 10/16] get_merge_kernel_size()

---
 tools/mtmd/clip.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f1df4d9d04b3a..4e9629789d6df 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -216,10 +216,10 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
+        warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
     }
 
-    int get_scale_factor_per_side() const {
+    int get_merge_kernel_size() const {
         return static_cast<int>(std::sqrt(proj_scale_factor));
     }
 };
@@ -550,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.get_scale_factor_per_side();
+        const int kernel_size = hparams.get_merge_kernel_size();
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -602,7 +602,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.get_scale_factor_per_side();
+        const int n_merge = hparams.get_merge_kernel_size();
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -4034,7 +4034,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4133,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 resized_image;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4164,7 +4164,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4359,7 +4359,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = params.get_scale_factor_per_side();
+                int scale_factor = params.get_merge_kernel_size();
                 int out_patch_size = params.patch_size * scale_factor;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
@@ -4369,7 +4369,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.get_scale_factor_per_side();
+                int n_merge = params.get_merge_kernel_size();
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {

From 2c0d96075adb9d2fdc28599eefa76f3c537c7133 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 16:06:55 +0100
Subject: [PATCH 11/16] fix llama4 and lfm2

---
 tools/mtmd/clip.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 4e9629789d6df..3a33d0f6b0342 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -550,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.get_merge_kernel_size();
+        const int kernel_size = hparams.proj_scale_factor;
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -578,7 +578,7 @@ struct clip_graph {
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_merge_kernel_size();
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -2715,9 +2715,12 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_LFM2:
                 {
+                    // correct non-standard proj_scale_factor value
+                    int spatial_merge = 2;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
                     // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(64, 256);
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
             case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2765,7 +2768,10 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
+                    // correct non-standard proj_scale_factor value
+                    int spatial_merge = 2;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
                     set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
@@ -2785,6 +2791,14 @@ struct clip_model_loader {
                 break;
         }
 
+        // sanity check
+        {
+            if (hparams.proj_scale_factor) {
+                const int n_merge = hparams.get_merge_kernel_size();
+                GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
+            }
+        }
+
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -4359,7 +4373,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = params.get_merge_kernel_size();
+                int scale_factor = ctx->model.hparams.get_merge_kernel_size();
                 int out_patch_size = params.patch_size * scale_factor;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;

From 4621d99dbf187744ef1aec2202e8f9e0cdf5ee87 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 16:29:07 +0100
Subject: [PATCH 12/16] gonna fix them all

---
 tools/mtmd/clip.cpp | 88 +++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 55 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3a33d0f6b0342..355b4243565ed 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -174,7 +174,7 @@ struct clip_hparams {
     int32_t image_longest_edge = 0;
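// [editor's note, not part of the diff] A worked example of the pixel budget
// these hparams encode, assuming the Qwen-style values used later in this
// series (patch_size = 14, n_merge = 2): one output token covers
// patch_size * patch_size * n_merge * n_merge = 14 * 14 * 2 * 2 = 784 pixels,
// so set_limit_image_tokens(8, 2048) yields image_min_pixels = 8 * 784 = 6272
// and image_max_pixels = 2048 * 784 = 1605632, i.e. a warmup image of roughly
// 1267 x 1267 (the square root of the max budget).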
     int32_t image_min_pixels = 0;
     int32_t image_max_pixels = 0;
-    int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2
+    int32_t n_merge = 0; // number of patch merges **per-side**
 
     float image_mean[3];
     float image_std[3];
@@ -207,7 +207,8 @@ struct clip_hparams {
     int32_t minicpmv_query_num = 0; // MiniCPM-V query number
 
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int patch_area = patch_size * patch_size * proj_scale_factor;
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
         image_min_pixels = n_tokens_min * patch_area;
         image_max_pixels = n_tokens_max * patch_area;
         warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
@@ -216,11 +217,8 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
-    }
-
-    int get_merge_kernel_size() const {
-        return static_cast<int>(std::sqrt(proj_scale_factor));
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
     }
 };
@@ -550,7 +548,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.proj_scale_factor;
+        const int kernel_size = hparams.n_merge;
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -572,13 +570,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.get_merge_kernel_size();
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -602,7 +600,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.get_merge_kernel_size();
+        const int n_merge = hparams.n_merge;
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -628,7 +626,7 @@ struct clip_graph {
         // mistral small 3.1 patch merger
         // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
         if (model.mm_patch_merger_w) {
-            GGML_ASSERT(hparams.proj_scale_factor > 0);
+            GGML_ASSERT(hparams.n_merge > 0);
             cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
 
@@ -944,8 +942,7 @@ struct clip_graph {
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.proj_scale_factor > 0
-            ? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
@@ -1168,7 +1165,7 @@ struct clip_graph {
 
         // pixel shuffle
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             const int height = n_patches_y;
             const int width = n_patches_x;
@@ -1258,7 +1255,7 @@ struct clip_graph {
         // based on Llama4VisionPixelShuffleMLP
         // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             GGML_ASSERT(scale_factor > 0);
             GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
@@ -1330,7 +1327,7 @@ struct clip_graph {
 
         {
             // patch_merger
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection norm
@@ -2706,19 +2703,16 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                 } break;
             case PROJECTOR_TYPE_LFM2:
                 {
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(64, 256);
                 } break;
@@ -2728,16 +2722,14 @@ struct clip_model_loader {
                     // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                     // TODO: verify the image_min_tokens
                     hparams.rope_theta = 10000.0f;
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_KIMIVL:
                 {
                     hparams.rope_theta = 10000.0f;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // TODO: check kimivl preprocessor for exact values
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
@@ -2746,17 +2738,16 @@ struct clip_model_loader {
                 {
                     // default value (used by all model sizes in gemma 3 family)
                     // number of patches for each **side** is reduced by a factor of 4
-                    hparams.proj_scale_factor = 4;
+                    hparams.n_merge = 4;
                    // test model (tinygemma3) has a different value, we optionally read it
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
            case PROJECTOR_TYPE_QWEN3VL:
                 {
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    hparams.n_merge = 2; // default value for Qwen 2 and 2.5
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                     // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                     // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
@@ -2768,10 +2759,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
@@ -2791,14 +2779,6 @@ struct clip_model_loader {
                 break;
         }
 
-        // sanity check
-        {
-            if (hparams.proj_scale_factor) {
-                const int n_merge = hparams.get_merge_kernel_size();
-                GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
-            }
-        }
-
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -2812,11 +2792,8 @@ struct clip_model_loader {
         LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
-        LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+        LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
         LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-        if (hparams.proj_scale_factor > 0) {
-            LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
-        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels);
         }
@@ -4048,7 +4025,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * params.n_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4145,9 +4122,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             {
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 clip_image_u8 resized_image;
+                // the original pixtral model doesn't have n_merge
+                const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * cur_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4178,7 +4157,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * params.n_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4366,15 +4345,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // both X and Y are downscaled by the scale factor
-                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                int scale_factor = ctx->model.hparams.n_merge;
                 n_patches /= (scale_factor * scale_factor);
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = ctx->model.hparams.get_merge_kernel_size();
-                int out_patch_size = params.patch_size * scale_factor;
+                int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
                 n_patches = x_patch * y_patch;
@@ -4383,7 +4361,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.get_merge_kernel_size();
+                int n_merge = ctx->model.hparams.n_merge;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {

From bae84d4433a4bba97674d3982f8150e6e2b5474f Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 00:02:48 +0100
Subject: [PATCH 13/16] use simple resize for qwen

---
 tools/mtmd/clip.cpp | 71 +++++++++++++--------------------------------
 1 file changed, 20 insertions(+), 51 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 355b4243565ed..be28d1e62b349 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3426,24 +3426,18 @@ struct img_tool {
         // RESIZE_ALGO_LANCZOS, // TODO
     };
 
-    enum resize_pad {
-        RESIZE_PAD_NONE,
-        RESIZE_PAD_AROUND,
-        RESIZE_PAD_BOTTOM_RIGHT,
-    };
-
     static void resize(
             const clip_image_u8 & src,
             clip_image_u8 & dst,
             const clip_image_size & target_resolution,
             resize_algo algo,
-            resize_pad pad_mode = RESIZE_PAD_AROUND,
+            bool add_padding = true, // TODO: define the behavior for add_padding = false
             std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
         dst.nx = target_resolution.width;
         dst.ny = target_resolution.height;
         dst.buf.resize(3 * dst.nx * dst.ny);
 
-        if (pad_mode == RESIZE_PAD_NONE) {
+        if (!add_padding) {
             // direct resize
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -3478,15 +3472,8 @@ struct img_tool {
             // fill dst with pad_color
             fill(dst, pad_color);
 
-            int offset_x = 0;
-            int offset_y = 0;
-            if (pad_mode == RESIZE_PAD_AROUND) {
-                offset_x = (target_resolution.width - new_width) / 2;
-                offset_y = (target_resolution.height - new_height) / 2;
-            } else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) {
-                offset_x = target_resolution.width - new_width;
-                offset_y = target_resolution.height - new_height;
-            }
+            int offset_x = (target_resolution.width - new_width) / 2;
+            int offset_y = (target_resolution.height - new_height) / 2;
 
             composite(dst, resized_image, offset_x, offset_y);
         }
@@ -3523,8 +3510,9 @@ struct img_tool {
         float target_width_f = static_cast<float>(inp_size.width) * scale;
         float target_height_f = static_cast<float>(inp_size.height) * scale;
 
-        int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
-        int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
+        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+        int aligned_width = ceil_by_factor(target_width_f);
+        int aligned_height = ceil_by_factor(target_height_f);
 
         return {aligned_width, aligned_height};
     }
@@ -3852,7 +3840,7 @@ struct llava_uhd {
         } else {
             // only algo bicubic preserves the ratio; old models rely on this behavior
             // TODO: do we need to support other algos here?
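// [editor's sketch, not part of the diff] How a caller drives the reworked
// resize() above after this patch; the target sizes below are illustrative,
// and the pad color is the gray used elsewhere in this file:
//
//     clip_image_u8 out;
//     // letterbox into 512x512: scale to fit, center, pad with gray
//     img_tool::resize(in, out, clip_image_size{512, 512},
//                      img_tool::RESIZE_ALGO_BILINEAR,
//                      /*add_padding=*/true, /*pad_color=*/{122, 116, 104});
//     // plain stretch to a grid-aligned size, no padding (the qwen path keeps
//     // the aspect ratio by picking the target via calc_size_preserved_ratio)
//     img_tool::resize(in, out, clip_image_size{448, 448},
//                      img_tool::RESIZE_ALGO_BICUBIC, /*add_padding=*/false);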
-            img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE);
+            img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
         }
 
         // create slices
@@ -4022,35 +4010,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN3VL:
             {
                 // step 1: make a blank canvas which aligns to the grid
-                clip_image_u8 canvas;
-                const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
+                clip_image_u8 resized;
+                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.n_merge,
+                    params.patch_size * 2,
                     params.image_min_pixels,
                     params.image_max_pixels);
-                canvas.nx = canvas_size.width;
-                canvas.ny = canvas_size.height;
-                canvas.buf.resize(3 * canvas.nx * canvas.ny);
-                img_tool::fill(canvas, {0, 0, 0});
-
-                // step 2: composite resized image onto the canvas, top-left corner
-                if (original_size.height > canvas.ny || original_size.width > canvas.nx) {
-                    // need to resize original image first
-                    clip_image_u8 resized;
-                    const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
-                        original_size,
-                        1, // no need to align here since we will composite onto canvas
-                        std::min(canvas.nx, canvas.ny)); // fit into the canvas
-                    img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
-                    img_tool::composite(canvas, resized, 0, 0);
-                } else {
-                    // no resizing needed
-                    img_tool::composite(canvas, *img, 0, 0);
-                }
-
+                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+                // clip_image_save_to_bmp(canvas, "preproc.bmp");
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
+                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
@@ -4163,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
 
                 clip_image_u8 resized_img;
-                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                 clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(res));
@@ -4195,7 +4165,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
 
                 // resize the image to the target_size
-                img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
+                img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
 
                 clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
@@ -4268,7 +4238,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
     const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
     if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+        return img->nx / (params.patch_size * 2);
     }
     return n_total;
 }
@@ -4276,7 +4246,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
 int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
     if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+        return img->ny / (params.patch_size * 2);
     }
     return 1;
 }
@@ -4334,9 +4304,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN3VL:
             {
                 // dynamic size (2 conv, so double patch size)
-                int patch_size = params.patch_size * 2;
-                int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-                int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+                int x_patch = img->nx / (params.patch_size * 2);
+                int y_patch = img->ny / (params.patch_size * 2);
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_GEMMA3:

From 29c726885cdef823c8f476792977bdc5286d7bb8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 00:07:36 +0100
Subject: [PATCH 14/16] qwen: increase min tokens

---
 tools/mtmd/clip.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index be28d1e62b349..90fe006272503 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2753,8 +2753,11 @@ struct clip_model_loader {
                     // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
                     // but we set a lower value to avoid OOM
                     // TODO: make it configurable by user
-                    hparams.set_limit_image_tokens(1, 2048);
-                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                    // TODO (2): bbox coordinates become inaccurate with small number of tokens,
+                    // therefore we need to increase the min_tokens
+                    // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
+                    hparams.set_limit_image_tokens(256, 2048);
+                    hparams.set_warmup_n_tokens(1024); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {

From 00ee52e743f89c2a5ce88c2ebd2059bfb7185898 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 11:42:44 +0100
Subject: [PATCH 15/16] no resize if dst size == src size

---
 tools/mtmd/clip.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 90fe006272503..6707f0b1addb2 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3440,6 +3440,12 @@ struct img_tool {
         dst.ny = target_resolution.height;
         dst.buf.resize(3 * dst.nx * dst.ny);
 
+        if (dst.nx == src.nx && dst.ny == src.ny) {
+            // no resize needed, simple copy
+            dst.buf = src.buf;
+            return;
+        }
+
         if (!add_padding) {
             // direct resize
             switch (algo) {
@@ -4020,7 +4026,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-                // clip_image_save_to_bmp(canvas, "preproc.bmp");
+                // clip_image_save_to_bmp(resized, "preproc.bmp");
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);

From a834e1ce56f93cc8d3cb7920524ef6cfeb610981 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 15:14:17 +0100
Subject: [PATCH 16/16] restore to initial min/max tokens value for qwen

---
 tools/mtmd/clip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 6707f0b1addb2..dcfdb49600b6c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2756,8 +2756,8 @@ struct clip_model_loader {
                     // TODO (2): bbox coordinates become inaccurate with small number of tokens,
                     // therefore we need to increase the min_tokens
                     // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-                    hparams.set_limit_image_tokens(256, 2048);
-                    hparams.set_warmup_n_tokens(1024); // avoid OOM on warmup
+                    hparams.set_limit_image_tokens(8, 2048);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
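
Reviewer's note, not part of the series: the net effect of patches 13-16 on the Qwen-VL path is a single aspect-preserving bilinear resize to a size aligned to patch_size * 2, after which the token counts are exact divisions. Below is a minimal self-contained sketch of that accounting; it assumes patch_size = 14 and a 2x2 spatial merge (the values in the Qwen preprocessor config cited above), ignores the min/max pixel clamp, and align_up is a hypothetical stand-in for CLIP_ALIGN.

#include <cstdio>

// ceil-align a dimension to the merge grid (hypothetical stand-in for CLIP_ALIGN)
static int align_up(int v, int a) { return ((v + a - 1) / a) * a; }

int main() {
    const int patch_size = 14;                    // ViT patch side
    const int n_merge    = 2;                     // 2x2 patches merge into one token
    const int grid       = patch_size * n_merge;  // alignment grid = 28 px

    // e.g. a 1280x720 input, aligned up to the grid by the preprocessor
    const int nx = align_up(1280, grid);          // 1288
    const int ny = align_up( 720, grid);          //  728

    // clip_n_output_tokens_x/y reduce to exact divisions after patch 13
    const int x_tok = nx / grid;                  // 46
    const int y_tok = ny / grid;                  // 26
    std::printf("%dx%d -> %d image tokens\n", nx, ny, x_tok * y_tok); // 1196
    return 0;
}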