From c1b189ae33ee19c5c90b0aefaa6ceff57471ec3e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 00:33:31 +0100 Subject: [PATCH 01/16] mtmd: refactor preprocessing + support max/min pixels --- tools/mtmd/clip-impl.h | 2 +- tools/mtmd/clip.cpp | 611 ++++++++++++++++++++++------------------- 2 files changed, 335 insertions(+), 278 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 311a4c9086a53..c7e9498349c1b 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -154,8 +154,8 @@ enum projector_type { PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_LIGHTONOCR, - PROJECTOR_TYPE_UNKNOWN, PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b312fda637f3b..79ddd6126f325 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3399,9 +3399,134 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 // set of tools to manupulate images // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv -struct image_manipulation { +struct img_tool { + enum resize_algo { + RESIZE_ALGO_BILINEAR, + RESIZE_ALGO_BICUBIC, + // RESIZE_ALGO_LANCZOS, // TODO + }; + + enum resize_pad { + RESIZE_PAD_NONE, + RESIZE_PAD_AROUND, + RESIZE_PAD_BOTTOM_RIGHT, + }; + + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + resize_pad pad_mode = RESIZE_PAD_AROUND, + std::array pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (pad_mode == RESIZE_PAD_NONE) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast(target_resolution.width) / src.nx; + float scale_h = static_cast(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + for (size_t i = 0; i < dst.buf.size(); i += 3) { + dst.buf[i] = pad_color[0]; + dst.buf[i + 1] = pad_color[1]; + dst.buf[i + 2] = pad_color[2]; + } + + int offset_x = 0; + int offset_y = 0; + if (pad_mode == RESIZE_PAD_AROUND) { + offset_x = (target_resolution.width - new_width) / 2; + offset_y = (target_resolution.height - new_height) / 2; + } else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) { + offset_x = target_resolution.width - new_width; + offset_y = target_resolution.height - new_height; + } + + draw_into(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + 
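// [editor's sketch, not part of the original patch] crop() assumes the
// rectangle [x, x+w) x [y, y+h) lies fully inside `image`; the slice
// coordinates built elsewhere in this patch are expected to satisfy that.
// A defensive variant could clamp the origin first, e.g.:
//   x = std::max(0, std::min(x, image.nx - w));
//   y = std::max(0, std::min(y, image.ny - h));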
dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 * (i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / inp_size.height); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); + int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + +private: + // draw src image into dst image at offset (offset_x, offset_y) + static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + for (int c = 0; c < 3; ++c) { + dst.buf[3 * ((y + offset_y) * dst.nx + (x + offset_x)) + c] = + src.buf[3 * (y * src.nx + x) + c]; + } + } + } + } + // Bilinear resize function - static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -3437,7 +3562,7 @@ struct image_manipulation { // Bicubic resize function // part of image will be cropped if the aspect ratio is different - static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { const int nx = img.nx; const int ny = img.ny; @@ -3500,93 +3625,6 @@ struct image_manipulation { return true; } - // llava-1.6 type of resize_and_pad - // if the ratio is not 1:1, padding with pad_color will be applied - // pad_color is single channel, default is 0 (black) - static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & dst, const clip_image_size & target_resolution, std::array pad_color = {0, 0, 0}) { - int target_width = target_resolution.width; - int target_height = target_resolution.height; - - float scale_w = static_cast(target_width) / image.nx; - float scale_h = static_cast(target_height) / image.ny; - - int new_width, new_height; - - if (scale_w < scale_h) { - new_width = target_width; - new_height = std::min(static_cast(std::ceil(image.ny * scale_w)), target_height); - } else { - new_height = target_height; - new_width = std::min(static_cast(std::ceil(image.nx * scale_h)), target_width); - } - - clip_image_u8 resized_image; - bicubic_resize(image, resized_image, new_width, new_height); - - clip_image_u8 padded_image; - padded_image.nx = target_width; - padded_image.ny = 
target_height; - padded_image.buf.resize(3 * target_width * target_height); - - // Fill the padded image with the fill color - for (size_t i = 0; i < padded_image.buf.size(); i += 3) { - padded_image.buf[i] = pad_color[0]; - padded_image.buf[i + 1] = pad_color[1]; - padded_image.buf[i + 2] = pad_color[2]; - } - - // Calculate padding offsets - int pad_x = (target_width - new_width) / 2; - int pad_y = (target_height - new_height) / 2; - - // Copy the resized image into the center of the padded buffer - for (int y = 0; y < new_height; ++y) { - for (int x = 0; x < new_width; ++x) { - for (int c = 0; c < 3; ++c) { - padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; - } - } - } - dst = std::move(padded_image); - } - - static void crop_image(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { - dst.nx = w; - dst.ny = h; - dst.buf.resize(3 * w * h); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int src_idx = 3 * ((y + i)*image.nx + (x + j)); - int dst_idx = 3 * (i*w + j); - dst.buf[dst_idx] = image.buf[src_idx]; - dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; - dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; - } - } - } - - // calculate the size of the **resized** image, while preserving the aspect ratio - // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than max_dimension, it will be resized to max_dimension - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { - return {0, 0}; - } - - float scale = std::min(static_cast(max_dimension) / inp_size.width, - static_cast(max_dimension) / inp_size.height); - - float target_width_f = static_cast(inp_size.width) * scale; - float target_height_f = static_cast(inp_size.height) * scale; - - int aligned_width = CLIP_ALIGN((int)target_width_f, align_size); - int aligned_height = CLIP_ALIGN((int)target_height_f, align_size); - - return {aligned_width, aligned_height}; - } - -private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } @@ -3735,10 +3773,11 @@ struct llava_uhd { static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { std::vector output; + img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); - image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size); + img_tool::resize(*img, *resized_img, inst.overview_size, interpolation); output.push_back(std::move(resized_img)); if (inst.slices.empty()) { // no slices, just return the resized image @@ -3748,9 +3787,11 @@ struct llava_uhd { // resize to refined size clip_image_u8_ptr refined_img(clip_image_u8_init()); if (inst.padding_refined) { - image_manipulation::resize_and_pad_image(*img, *refined_img, inst.refined_size); + img_tool::resize(*img, *refined_img, inst.refined_size, interpolation); } else { - image_manipulation::bilinear_resize(*img, *refined_img, inst.refined_size.width, inst.refined_size.height); + // only algo bicubic preserves the ratio; old models rely on this behavior + // TODO: do we need to support other algos here? 
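// [editor's note] RESIZE_PAD_NONE stretches directly to refined_size with
// no padding; this matches the old bilinear_resize() call removed above,
// except that the interpolation is now bicubic instead of bilinear.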
+ img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE); } // create slices @@ -3761,7 +3802,7 @@ struct llava_uhd { int h = slice.size.height; clip_image_u8_ptr img_slice(clip_image_u8_init()); - image_manipulation::crop_image(*refined_img, *img_slice, x, y, w, h); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); output.push_back(std::move(img_slice)); } @@ -3896,208 +3937,224 @@ struct llava_uhd { // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { clip_image_size original_size{img->nx, img->ny}; - bool pad_to_square = true; auto & params = ctx->model.hparams; - // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { - pad_to_square = false; - } - if (clip_is_minicpmv(ctx)) { - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_MINICPMV: + { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + clip_image_u8 resized; + auto patch_size = params.patch_size * 2; + auto new_size = img_tool::calc_size_preserved_ratio(original_size, patch_size, params.image_size); + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR); + + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + } break; - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) { - clip_image_u8 resized; - auto patch_size = params.patch_size * 2; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); - image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height); - - clip_image_f32_ptr img_f32(clip_image_f32_init()); - // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); - // res_imgs->data[0] = *res; - res_imgs->entries.push_back(std::move(img_f32)); - return true; - } else if 
(ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // The refined size has two steps: - // 1. Resize w/ aspect-ratio preserving such that the longer side is - // the preprocessor longest size - // 2. Resize w/out preserving aspect ratio such that both sides are - // multiples of image_size (always rounding up) - // - // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 - const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio( - original_size, params.image_size, params.preproc_image_size); - // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", - // __func__, original_size.width, original_size.height, - // refined_size.width, refined_size.height); - - llava_uhd::slice_instructions instructions; - instructions.overview_size = clip_image_size{params.image_size, params.image_size}; - instructions.refined_size = refined_size; - instructions.grid_size = clip_image_size{ - static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), - static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), - }; - for (int y = 0; y < refined_size.height; y += params.image_size) { - for (int x = 0; x < refined_size.width; x += params.image_size) { - // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); - instructions.slices.push_back(llava_uhd::slice_coordinates{ - /* x */x, - /* y */y, - /* size */clip_image_size{ - std::min(params.image_size, refined_size.width - x), - std::min(params.image_size, refined_size.height - y) + case PROJECTOR_TYPE_IDEFICS3: + { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, params.image_size, params.preproc_image_size); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{params.image_size, params.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), + }; + for (int y = 0; y < refined_size.height; y += params.image_size) { + for (int x = 0; x < refined_size.width; x += params.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(params.image_size, refined_size.width - x), + std::min(params.image_size, refined_size.height - y) + } + }); } - }); - } - } - auto imgs = llava_uhd::slice_image(img, instructions); - - // cast and normalize to f32 - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - 
res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = instructions.grid_size.width; - res_imgs->grid_y = instructions.grid_size.height; - return true; - } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE - || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution - ) { - clip_image_u8 resized_image; - int sz = params.image_size; - image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL - || ctx->proj_type() == PROJECTOR_TYPE_LIGHTONOCR - ) { - clip_image_u8 resized_image; - auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); - image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(img_f32)); - return true; - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) { - GGML_ASSERT(!params.image_res_candidates.empty()); - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } - - res_imgs->grid_x = inst.grid_size.width; - res_imgs->grid_y = inst.grid_size.height; - return true; - - } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 - || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL - ) { - GGML_ASSERT(params.proj_scale_factor); - - // smart resize - const int width = img->nx; - const int height = img->ny; - const int total_factor = params.patch_size * params.proj_scale_factor; - constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 1024; - const float min_pixels = min_image_tokens * total_factor * total_factor; - const float max_pixels = max_image_tokens * total_factor * total_factor; - - auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; - - int h_bar = std::max(total_factor, round_by_factor(height)); - int w_bar = std::max(total_factor, round_by_factor(width)); - - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt((height * width) / max_pixels); - h_bar = std::max(total_factor, floor_by_factor(height / beta)); - w_bar = std::max(total_factor, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(min_pixels / (height * width)); - h_bar = ceil_by_factor(height * beta); - w_bar = ceil_by_factor(width * beta); - } + } + auto imgs = llava_uhd::slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) 
+ ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - const std::array pad_color = {122, 116, 104}; + res_imgs->grid_x = instructions.grid_size.width; + res_imgs->grid_y = instructions.grid_size.height; + } break; - clip_image_u8 resized_img; - image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; - } + case PROJECTOR_TYPE_GLM_EDGE: + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + clip_image_u8 resized_image; + auto new_size = img_tool::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + img_tool::resize(*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; - clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + case PROJECTOR_TYPE_LLAMA4: + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - if (pad_to_square) { - // for llava-1.5, we resize image to a square, and pad the shorter side with a background color - // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 - const int longer_side = std::max(img->nx, img->ny); - temp->nx = longer_side; - temp->ny = longer_side; - temp->buf.resize(3 * longer_side * longer_side); + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; - // background color in RGB from LLaVA (this is the mean rgb color * 255) - const std::array pad_color = {122, 116, 104}; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + { + GGML_ASSERT(params.proj_scale_factor); + // smart resize + const int width = img->nx; + const int height = img->ny; + const int total_factor = params.patch_size * params.proj_scale_factor; + constexpr int min_image_tokens = 64; + constexpr int max_image_tokens = 1024; + const float min_pixels = min_image_tokens * total_factor * total_factor; + 
const float max_pixels = max_image_tokens * total_factor * total_factor; + + auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + int h_bar = std::max(total_factor, round_by_factor(height)); + int w_bar = std::max(total_factor, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt((height * width) / max_pixels); + h_bar = std::max(total_factor, floor_by_factor(height / beta)); + w_bar = std::max(total_factor, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(min_pixels / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } - // resize the image to the target_size - image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); + const std::array pad_color = {122, 116, 104}; - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - return true; + clip_image_u8 resized_img; + img_tool::resize(*img, resized_img, clip_image_size{w_bar, h_bar}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } break; - } else if (!params.image_res_candidates.empty()) { - // "spatial_unpad" with "anyres" processing for llava-1.6 - auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); - std::vector imgs = llava_uhd::slice_image(img, inst); + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm? 
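// [editor's note] these projector types share the legacy LLaVA path below:
// pad-to-square for llava-1.5-style configs, or "anyres" slicing for
// llava-1.6-style configs.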
+ { + // TODO @ngxson : refactor the code below to avoid duplicated logic + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily + + // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing + if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { // pad_to_square + // for llava-1.5, we resize image to a square, and pad the shorter side with a background color + // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + const int longer_side = std::max(img->nx, img->ny); + temp->nx = longer_side; + temp->ny = longer_side; + temp->buf.resize(3 * longer_side * longer_side); + + // background color in RGB from LLaVA (this is the mean rgb color * 255) + const std::array pad_color = {122, 116, 104}; + + // resize the image to the target_size + img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + return true; + + } else if (!params.image_res_candidates.empty()) { + // "spatial_unpad" with "anyres" processing for llava-1.6 + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); - res_imgs->entries.push_back(std::move(res)); - } + return true; + } + } break; - return true; - } else { - GGML_ABORT("Unknown image preprocessing type"); + default: + LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type()); + return false; } + return true; } ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { From 13cd2045ab2128af130363df4c5cf15321adc972 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 10:35:04 +0100 Subject: [PATCH 02/16] fix mlp type --- tools/mtmd/clip.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 79ddd6126f325..6e0081aee05c9 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4114,7 +4114,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing - if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { // pad_to_square + if 
(params.image_res_candidates.empty()) { // pad_to_square // for llava-1.5, we resize image to a square, and pad the shorter side with a background color // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 const int longer_side = std::max(img->nx, img->ny); @@ -4131,9 +4131,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); - return true; - } else if (!params.image_res_candidates.empty()) { + } else { // "spatial_unpad" with "anyres" processing for llava-1.6 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); std::vector imgs = llava_uhd::slice_image(img, inst); @@ -4144,8 +4143,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); } - - return true; } } break; From 66d5c43d444a5a1f3021f05feab1fd6afcdc0f32 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 11:55:28 +0100 Subject: [PATCH 03/16] implement mix/max pixels --- tools/mtmd/clip.cpp | 97 ++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 6e0081aee05c9..09e082541b7bb 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -171,7 +171,9 @@ struct clip_hparams { int32_t n_head; int32_t n_layer; // idefics3 - int32_t preproc_image_size = 0; // aka max_dimension + int32_t image_longest_edge = 0; + int32_t image_min_pixels = 0; + int32_t image_max_pixels = 0; int32_t proj_scale_factor = 0; float image_mean[3]; @@ -204,6 +206,13 @@ struct clip_hparams { bool has_llava_projector = false; int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // used by LFM2 and KIMI-VL + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int total_factor = patch_size * proj_scale_factor; + image_min_pixels = n_tokens_min * total_factor * total_factor; + image_max_pixels = n_tokens_max * total_factor * total_factor; + } }; struct clip_layer { @@ -2577,7 +2586,7 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy @@ -2686,10 +2695,14 @@ struct clip_model_loader { hparams.minicpmv_version = 2; // default to 2 if not set } } break; + case PROJECTOR_TYPE_INTERNVL: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: - case PROJECTOR_TYPE_INTERNVL: { + hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: @@ -2697,15 +2710,14 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; hparams.warmup_image_size = hparams.patch_size * 8; - // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM - // ref: https://github.com/ggml-org/llama.cpp/issues/14310 - hparams.image_size = 1024; 
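// [editor's note] the token-count limits added below replace the hard
// 1024x1024 image_size cap removed above, so the pixel budget now scales
// with patch geometry rather than a fixed edge length.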
+ hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; hparams.warmup_image_size = hparams.patch_size * 8; + hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_GEMMA3: @@ -3494,14 +3506,14 @@ struct img_tool { // calculate the size of the **resized** image, while preserving the aspect ratio // the calculated size will be aligned to the nearest multiple of align_size - // if H or W size is larger than max_dimension, it will be resized to max_dimension - static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) { return {0, 0}; } - float scale = std::min(static_cast(max_dimension) / inp_size.width, - static_cast(max_dimension) / inp_size.height); + float scale = std::min(static_cast(longest_edge) / inp_size.width, + static_cast(longest_edge) / inp_size.height); float target_width_f = static_cast(inp_size.width) * scale; float target_height_f = static_cast(inp_size.height) * scale; @@ -3512,6 +3524,33 @@ struct img_tool { return {aligned_width, aligned_height}; } + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + const int width = inp_size.width; + const int height = inp_size.height; + + auto round_by_factor = [f = align_size](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; + auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; + + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + private: // draw src image into dst image at offset (offset_x, offset_y) static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { @@ -3982,7 +4021,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str // // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( - original_size, 
params.image_size, params.preproc_image_size); + original_size, params.image_size, params.image_longest_edge); // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", // __func__, original_size.width, original_size.height, // refined_size.width, refined_size.height); @@ -4064,37 +4103,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: { - GGML_ASSERT(params.proj_scale_factor); - // smart resize - const int width = img->nx; - const int height = img->ny; - const int total_factor = params.patch_size * params.proj_scale_factor; - constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 1024; - const float min_pixels = min_image_tokens * total_factor * total_factor; - const float max_pixels = max_image_tokens * total_factor * total_factor; - - auto round_by_factor = [f = total_factor](float x) { return static_cast(std::nearbyintf(x / static_cast(f))) * f; }; - auto ceil_by_factor = [f = total_factor](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; - auto floor_by_factor = [f = total_factor](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; - - int h_bar = std::max(total_factor, round_by_factor(height)); - int w_bar = std::max(total_factor, round_by_factor(width)); - - if (h_bar * w_bar > max_pixels) { - const auto beta = std::sqrt((height * width) / max_pixels); - h_bar = std::max(total_factor, floor_by_factor(height / beta)); - w_bar = std::max(total_factor, floor_by_factor(width / beta)); - } else if (h_bar * w_bar < min_pixels) { - const auto beta = std::sqrt(min_pixels / (height * width)); - h_bar = ceil_by_factor(height * beta); - w_bar = ceil_by_factor(width * beta); - } - + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.proj_scale_factor, + params.image_min_pixels, + params.image_max_pixels); const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, clip_image_size{w_bar, h_bar}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); From c53c566319b6394e7ba61f07e06001c9ae642901 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 12:40:22 +0100 Subject: [PATCH 04/16] improve hparams --- tools/mtmd/clip.cpp | 73 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 09e082541b7bb..91aab1f72a3ea 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -174,7 +174,7 @@ struct clip_hparams { int32_t image_longest_edge = 0; int32_t image_min_pixels = 0; int32_t image_max_pixels = 0; - int32_t proj_scale_factor = 0; + int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2 float image_mean[3]; float image_std[3]; @@ -196,7 +196,6 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; - int32_t spatial_merge_size = 0; // audio int32_t n_mel_bins = 0; // whisper preprocessor @@ -209,9 +208,16 @@ struct clip_hparams { // used by LFM2 and KIMI-VL void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const 
int total_factor = patch_size * proj_scale_factor; - image_min_pixels = n_tokens_min * total_factor * total_factor; - image_max_pixels = n_tokens_max * total_factor * total_factor; + const int patch_area = patch_size * patch_size * proj_scale_factor; + image_min_pixels = n_tokens_min * patch_area; + image_max_pixels = n_tokens_max * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + warmup_image_size = n_tok_per_side * patch_size * static_cast(std::sqrt(proj_scale_factor)); } }; @@ -593,7 +599,7 @@ struct clip_graph { } ggml_cgraph * build_pixtral() { - const int n_merge = hparams.spatial_merge_size; + const int n_merge = hparams.proj_scale_factor; // 2D input positions ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); @@ -619,7 +625,7 @@ struct clip_graph { // mistral small 3.1 patch merger // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.spatial_merge_size > 0); + GGML_ASSERT(hparams.proj_scale_factor > 0); cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); @@ -935,7 +941,7 @@ struct clip_graph { // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl + const int merge_factor = hparams.proj_scale_factor > 0 ? 
hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl // loop over layers for (int il = 0; il < n_layer; il++) { @@ -2700,25 +2706,32 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_IDEFICS3: + { + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; case PROJECTOR_TYPE_LFM2: { - hparams.set_limit_image_tokens(64, 1024); + hparams.set_limit_image_tokens(8, 256); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - hparams.set_limit_image_tokens(64, 1024); - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + int spatial_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); + hparams.proj_scale_factor = spatial_merge * spatial_merge; + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; - hparams.warmup_image_size = hparams.patch_size * 8; - hparams.set_limit_image_tokens(64, 1024); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_GEMMA3: { @@ -2729,29 +2742,15 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_QWEN2VL: - { - // max image size = sqrt(max_pixels) = 3584 - // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - } break; case PROJECTOR_TYPE_QWEN25VL: - { - // max image size = sqrt(max_pixels) - // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - // however, the model use unreasonable memory past 1024 size, we force it to 1024 otherwise it's unusable - // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10 - hparams.image_size = 1024; - hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); - } break; case PROJECTOR_TYPE_QWEN3VL: { - hparams.image_size = 1024; // still need this? 
- hparams.warmup_image_size = hparams.patch_size * 8; - get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); + int spatial_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); + hparams.proj_scale_factor = spatial_merge * spatial_merge; + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_LLAMA4: { @@ -2791,8 +2790,8 @@ struct clip_model_loader { LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); - if (hparams.spatial_merge_size > 0) { - LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size); + if (hparams.proj_scale_factor > 0) { + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); @@ -4310,7 +4309,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_LIGHTONOCR: { // dynamic size - int n_merge = params.spatial_merge_size; + int n_merge = params.proj_scale_factor; int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); if (ctx->model.token_embd_img_break) { From 68b1507a6600c5868ab4af3c91f003307815adf0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 31 Oct 2025 14:58:55 +0100 Subject: [PATCH 05/16] better image preproc for qwen --- tools/mtmd/clip.cpp | 96 +++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 91aab1f72a3ea..0f8338fd6b05a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -206,7 +206,6 @@ struct clip_hparams { int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number - // used by LFM2 and KIMI-VL void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { const int patch_area = patch_size * patch_size * proj_scale_factor; image_min_pixels = n_tokens_min * patch_area; @@ -2592,7 +2591,6 @@ struct clip_model_loader { if (is_vision) { get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy @@ -2707,18 +2705,20 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_IDEFICS3: { - hparams.set_limit_image_tokens(8, 1024); - hparams.set_warmup_n_tokens(256); // avoid OOM on warmup get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); } break; case PROJECTOR_TYPE_LFM2: { - hparams.set_limit_image_tokens(8, 256); + // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json + hparams.set_limit_image_tokens(64, 256); get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { + // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json + // TODO: verify the image_min_tokens hparams.rope_theta = 10000.0f; int spatial_merge = 2; 
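// [editor's note] proj_scale_factor stores spatial_merge squared (set just
// below), e.g. spatial_merge = 2 -> proj_scale_factor = 4, i.e. one output
// token per 2x2 block of patches.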
get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); @@ -2730,6 +2730,7 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + // TODO: check kimivl preprocessor for exact values hparams.set_limit_image_tokens(8, 1024); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; @@ -2749,7 +2750,11 @@ struct clip_model_loader { get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false); hparams.proj_scale_factor = spatial_merge * spatial_merge; get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it - hparams.set_limit_image_tokens(8, 1024); + // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens + // but we set a lower value to avoid OOM + // TODO: make it configurable by user + hparams.set_limit_image_tokens(1, 2048); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; case PROJECTOR_TYPE_LLAMA4: @@ -2791,7 +2796,13 @@ struct clip_model_loader { LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); if (hparams.proj_scale_factor > 0) { - LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + } + if (hparams.image_min_pixels > 0) { + LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels); + } + if (hparams.image_max_pixels > 0) { + LOG_INF("%s: image_max_pixels: %d\n", __func__, hparams.image_max_pixels); } } else if (is_audio) { LOG_INF("\n--- audio hparams ---\n"); @@ -3467,11 +3478,7 @@ struct img_tool { } // fill dst with pad_color - for (size_t i = 0; i < dst.buf.size(); i += 3) { - dst.buf[i] = pad_color[0]; - dst.buf[i + 1] = pad_color[1]; - dst.buf[i + 2] = pad_color[2]; - } + fill(dst, pad_color); int offset_x = 0; int offset_y = 0; @@ -3483,7 +3490,7 @@ struct img_tool { offset_y = target_resolution.height - new_height; } - draw_into(dst, resized_image, offset_x, offset_y); + composite(dst, resized_image, offset_x, offset_y); } } @@ -3507,7 +3514,8 @@ struct img_tool { // the calculated size will be aligned to the nearest multiple of align_size // if H or W size is larger than longest_edge, it will be resized to longest_edge static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { - if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || longest_edge <= 0) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { return {0, 0}; } @@ -3527,6 +3535,7 @@ struct img_tool { // the calculated size will have min_pixels <= W*H <= max_pixels // this is referred as "smart_resize" in transformers code static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); const int width = inp_size.width; const int height = inp_size.height; @@ -3550,9 +3559,8 @@ struct img_tool { return {w_bar, h_bar}; } -private: // draw src image into dst image at offset (offset_x, offset_y) - static void draw_into(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { for 
(int y = 0; y < src.ny; ++y) {
             for (int x = 0; x < src.nx; ++x) {
                 for (int c = 0; c < 3; ++c) {
@@ -3563,6 +3571,16 @@ struct img_tool {
         }
     }
 
+    // fill the image with a solid color
+    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
+        for (size_t i = 0; i < img.buf.size(); i += 3) {
+            img.buf[i] = color[0];
+            img.buf[i + 1] = color[1];
+            img.buf[i + 2] = color[2];
+        }
+    }
+
+private:
     // Bilinear resize function
     static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
         dst.nx = target_width;
@@ -3998,14 +4016,40 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                clip_image_u8 resized;
-                auto patch_size = params.patch_size * 2;
-                auto new_size = img_tool::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR);
+                // step 1: make a blank canvas which aligns with grid
+                clip_image_u8 canvas;
+                const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                canvas.nx = canvas_size.width;
+                canvas.ny = canvas_size.height;
+                canvas.buf.resize(3 * canvas.nx * canvas.ny);
+                img_tool::fill(canvas, {0, 0, 0});
+
+                // step 2: resize original image to fit into the canvas
+                const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    1, // avoid distorting which causes bbox misalignment
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+
+                if (scaled_size.height != original_size.height ||
+                        scaled_size.width != original_size.width) {
+                    clip_image_u8 resized;
+                    img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
+                    // step 3: composite resized image onto the canvas, top-left corner
+                    img_tool::composite(canvas, resized, 0, 0);
+                } else {
+                    // no resizing needed
+                    // step 3: composite original image onto the canvas, top-left corner
+                    img_tool::composite(canvas, *img, 0, 0);
+                }
 
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+                normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
@@ -4076,8 +4120,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 clip_image_u8 resized_image;
-                auto new_size = img_tool::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
-                img_tool::resize(*img, resized_image, new_size, img_tool::RESIZE_ALGO_BILINEAR);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(img_f32));
@@ -4104,7 +4152,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             {
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.proj_scale_factor,
+                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};

From 7bd1a011b4b9dc2901af5fe959c984e8b6a1b30a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:18:19 +0100
Subject: [PATCH 06/16] fix

---
 tools/mtmd/clip.cpp | 47 ++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 0f8338fd6b05a..b18f56fe57701 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -216,7 +216,11 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * static_cast<int>(std::sqrt(proj_scale_factor));
+        warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
+    }
+
+    int get_scale_factor_per_side() const {
+        return static_cast<int>(std::sqrt(proj_scale_factor));
     }
 };
@@ -546,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.proj_scale_factor;
+        const int kernel_size = hparams.get_scale_factor_per_side();
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -568,13 +572,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_scale_factor_per_side();
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_scale_factor_per_side();
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -598,7 +602,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.proj_scale_factor;
+        const int n_merge = hparams.get_scale_factor_per_side();
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -940,7 +944,8 @@ struct clip_graph {
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.proj_scale_factor > 0 ? hparams.proj_scale_factor * hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.proj_scale_factor > 0
+            ? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
@@ -2366,16 +2371,16 @@ struct clip_graph {
 
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
-    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
-        GGML_ASSERT(scale_factor > 1);
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int kernel_size) {
+        GGML_ASSERT(kernel_size > 1);
 
         const int n_embd = cur->ne[0];
         int width = img.nx / patch_size;
         int height = img.ny / patch_size;
 
         // pad width and height to factor
-        const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
-        const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
+        const int64_t pad_width = CLIP_ALIGN(width, kernel_size) - width;
+        const int64_t pad_height = CLIP_ALIGN(height, kernel_size) - height;
         cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
         if (pad_width || pad_height) {
             cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
@@ -2384,11 +2389,11 @@ struct clip_graph {
         }
 
         // unshuffle h
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
+        cur = ggml_reshape_3d(ctx0, cur, n_embd * kernel_size, width / kernel_size, height);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         // unshuffle w
-        cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
+        cur = ggml_cont_3d(ctx0, cur, n_embd * kernel_size * kernel_size, height / kernel_size, width / kernel_size);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
@@ -3203,9 +3208,11 @@ struct clip_model_loader {
             if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
                 img->nx = hparams.warmup_image_size;
                 img->ny = hparams.warmup_image_size;
+                LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
             } else {
                 img->nx = hparams.warmup_audio_size;
                 img->ny = hparams.n_mel_bins;
+                LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
             }
             batch.entries.push_back(std::move(img));
 
@@ -4020,7 +4027,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4119,10 +4126,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
+                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 clip_image_u8 resized_image;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(img_f32));
@@ -4150,9 +4158,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
+                GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * static_cast<int>(std::sqrt(params.proj_scale_factor)),
+                    params.patch_size * params.get_scale_factor_per_side(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};

From 2892e0fb0c684bb91c944e1ea4cab416d1793162 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:30:29 +0100
Subject: [PATCH 07/16] fix out of bound composite

---
 tools/mtmd/clip.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index b18f56fe57701..4bfe68820f94e 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -572,13 +572,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.get_scale_factor_per_side();
+            const int scale_factor = model.hparams.proj_scale_factor;
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.get_scale_factor_per_side();
+            const int scale_factor = model.hparams.proj_scale_factor;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -3570,10 +3570,17 @@ struct img_tool {
     static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
         for (int y = 0; y < src.ny; ++y) {
             for (int x = 0; x < src.nx; ++x) {
-                for (int c = 0; c < 3; ++c) {
-                    dst.buf[3 * ((y + offset_y) * dst.nx + (x + offset_x)) + c] =
-                        src.buf[3 * (y * src.nx + x) + c];
+                int dx = x + offset_x;
+                int dy = y + offset_y;
+                // skip pixels that would be out of bounds in the destination
+                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+                    continue;
                 }
+                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
+                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
+                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
+                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
             }
         }
     }

From 42744178a270256da1f49e9c824a523520b44e60 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:35:54 +0100
Subject: [PATCH 08/16] fix (2)

---
 tools/mtmd/clip.cpp | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 4bfe68820f94e..2442064610501 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3546,12 +3546,12 @@ struct img_tool {
         const int width = inp_size.width;
         const int height = inp_size.height;
 
-        auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
         auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
         auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
 
-        int h_bar = std::max(align_size, round_by_factor(height));
-        int w_bar = std::max(align_size, round_by_factor(width));
+        // always align up first
+        int h_bar = std::max(align_size, ceil_by_factor(height));
+        int w_bar = std::max(align_size, ceil_by_factor(width));
 
         if (h_bar * w_bar > max_pixels) {
             const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
@@ -4030,7 +4030,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                // step 1: make a blank canvas which aligns with grid
+                // step 1: make a blank canvas which aligns to the grid
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
@@ -4042,22 +4042,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 canvas.buf.resize(3 * canvas.nx * canvas.ny);
                 img_tool::fill(canvas, {0, 0, 0});
 
-                // step 2: resize original image to fit into the canvas
-                const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    1, // avoid distorting which causes bbox misalignment
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-
-                if (scaled_size.height != original_size.height ||
-                        scaled_size.width != original_size.width) {
+                // step 2: composite resized image onto the canvas, top-left corner
+                if (original_size.height > canvas.ny || original_size.width > canvas.nx) {
+                    // need to resize original image first
                     clip_image_u8 resized;
+                    const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
+                        original_size,
+                        1, // no need to align here since we will composite onto canvas
+                        std::min(canvas.nx, canvas.ny)); // fit into the canvas
                     img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
-                    // step 3: composite resized image onto the canvas, top-left corner
                     img_tool::composite(canvas, resized, 0, 0);
                 } else {
                     // no resizing needed
-                    // step 3: composite original image onto the canvas, top-left corner
                     img_tool::composite(canvas, *img, 0, 0);
                 }
 

From 000d1d9d35f012b896e17eb6c9db941080c4cd4d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:42:28 +0100
Subject: [PATCH 09/16] fix token calculation

---
 tools/mtmd/clip.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2442064610501..f1df4d9d04b3a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2371,16 +2371,16 @@ struct clip_graph {
 
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
-    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int kernel_size) {
-        GGML_ASSERT(kernel_size > 1);
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+        GGML_ASSERT(scale_factor > 1);
 
         const int n_embd = cur->ne[0];
         int width = img.nx / patch_size;
         int height = img.ny / patch_size;
 
         // pad width and height to factor
-        const int64_t pad_width = CLIP_ALIGN(width, kernel_size) - width;
-        const int64_t pad_height = CLIP_ALIGN(height, kernel_size) - height;
+        const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
+        const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
         cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
         if (pad_width || pad_height) {
             cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
@@ -2384,11 +2389,11 @@ struct clip_graph {
         }
 
         // unshuffle h
-        cur = ggml_reshape_3d(ctx0, cur, n_embd * kernel_size, width / kernel_size, height);
+        cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         // unshuffle w
-        cur = ggml_cont_3d(ctx0, cur, n_embd * kernel_size * kernel_size, height / kernel_size, width / kernel_size);
+        cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
 
         cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
@@ -4351,7 +4351,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_INTERNVL:
         case PROJECTOR_TYPE_LLAMA4:
             {
-                n_patches /= ctx->model.hparams.proj_scale_factor;
+                // both X and Y are downscaled by the scale factor
+                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                n_patches /= (scale_factor * scale_factor);
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:

From bfd03fb6db78accb91f24b4740fbc126040a53d4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 15:42:59 +0100
Subject: [PATCH 10/16] get_merge_kernel_size()

---
 tools/mtmd/clip.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f1df4d9d04b3a..4e9629789d6df 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -216,10 +216,10 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * get_scale_factor_per_side();
+        warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
     }
 
-    int get_scale_factor_per_side() const {
+    int get_merge_kernel_size() const {
         return static_cast<int>(std::sqrt(proj_scale_factor));
     }
 };
@@ -550,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.get_scale_factor_per_side();
+        const int kernel_size = hparams.get_merge_kernel_size();
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -602,7 +602,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.get_scale_factor_per_side();
+        const int n_merge = hparams.get_merge_kernel_size();
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -4034,7 +4034,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4133,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 resized_image;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4164,7 +4164,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_scale_factor_per_side(),
+                    params.patch_size * params.get_merge_kernel_size(),
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4359,7 +4359,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = params.get_scale_factor_per_side();
+                int scale_factor = params.get_merge_kernel_size();
                 int out_patch_size = params.patch_size * scale_factor;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
@@ -4369,7 +4369,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.get_scale_factor_per_side();
+                int n_merge = params.get_merge_kernel_size();
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {

From 2c0d96075adb9d2fdc28599eefa76f3c537c7133 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 16:06:55 +0100
Subject: [PATCH 11/16] fix llama4 and lfm2

---
 tools/mtmd/clip.cpp | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 4e9629789d6df..3a33d0f6b0342 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -550,7 +550,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.get_merge_kernel_size();
+        const int kernel_size = hparams.proj_scale_factor;
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -578,7 +578,7 @@ struct clip_graph {
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.get_merge_kernel_size();
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -2715,9 +2715,12 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_LFM2:
                 {
+                    // correct non-standard proj_scale_factor value
+                    int spatial_merge = 2;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
                     // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(64, 256);
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
             case PROJECTOR_TYPE_PIXTRAL:
             case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2765,7 +2768,10 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
+                    // correct non-standard proj_scale_factor value
+                    int spatial_merge = 2;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
+                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
                     set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
@@ -2785,6 +2791,14 @@ struct clip_model_loader {
                 break;
         }
 
+        // sanity check
+        {
+            if (hparams.proj_scale_factor) {
+                const int n_merge = hparams.get_merge_kernel_size();
+                GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
+            }
+        }
+
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -4359,7 +4373,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = params.get_merge_kernel_size();
+                int scale_factor = ctx->model.hparams.get_merge_kernel_size();
                 int out_patch_size = params.patch_size * scale_factor;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;

From 4621d99dbf187744ef1aec2202e8f9e0cdf5ee87 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 31 Oct 2025 16:29:07 +0100
Subject: [PATCH 12/16] gonna fix them all

---
 tools/mtmd/clip.cpp | 88 +++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 55 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3a33d0f6b0342..355b4243565ed 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -174,7 +174,7 @@ struct clip_hparams {
     int32_t image_longest_edge = 0;
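// [editor's note, not part of the diff] A worked example of the pixel budget
// these hparams encode, assuming the Qwen-style values used later in this
// series (patch_size = 14, n_merge = 2): one output token covers
// patch_size * patch_size * n_merge * n_merge = 14 * 14 * 2 * 2 = 784 pixels,
// so set_limit_image_tokens(8, 2048) yields image_min_pixels = 8 * 784 = 6272
// and image_max_pixels = 2048 * 784 = 1605632, i.e. a warmup image of roughly
// 1267 x 1267 (the square root of the max budget).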
     int32_t image_min_pixels = 0;
     int32_t image_max_pixels = 0;
-    int32_t proj_scale_factor = 0; // = (spatial_merge_size)^2
+    int32_t n_merge = 0; // number of patch merges **per-side**
 
     float image_mean[3];
     float image_std[3];
@@ -207,7 +207,8 @@ struct clip_hparams {
     int32_t minicpmv_query_num = 0; // MiniCPM-V query number
 
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
-        const int patch_area = patch_size * patch_size * proj_scale_factor;
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
         image_min_pixels = n_tokens_min * patch_area;
         image_max_pixels = n_tokens_max * patch_area;
         warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
@@ -216,11 +217,8 @@ struct clip_hparams {
     void set_warmup_n_tokens(int n_tokens) {
         int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
         GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
-        warmup_image_size = n_tok_per_side * patch_size * get_merge_kernel_size();
-    }
-
-    int get_merge_kernel_size() const {
-        return static_cast<int>(std::sqrt(proj_scale_factor));
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
     }
 };
@@ -550,7 +548,7 @@ struct clip_graph {
         const int batch_size = 1;
         GGML_ASSERT(n_patches_x == n_patches_y);
         const int patches_per_image = n_patches_x;
-        const int kernel_size = hparams.proj_scale_factor;
+        const int kernel_size = hparams.n_merge;
 
         cur = ggml_transpose(ctx0, cur);
         cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
@@ -572,13 +570,13 @@ struct clip_graph {
         } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
             // pixel_shuffle
             // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
             cur = ggml_mul_mat(ctx0, model.projection, cur);
 
         } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
             // pixel unshuffle block
-            const int scale_factor = model.hparams.get_merge_kernel_size();
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection
@@ -602,7 +600,7 @@ struct clip_graph {
     }
 
     ggml_cgraph * build_pixtral() {
-        const int n_merge = hparams.get_merge_kernel_size();
+        const int n_merge = hparams.n_merge;
 
         // 2D input positions
         ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
@@ -628,7 +626,7 @@ struct clip_graph {
         // mistral small 3.1 patch merger
         // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
         if (model.mm_patch_merger_w) {
-            GGML_ASSERT(hparams.proj_scale_factor > 0);
+            GGML_ASSERT(hparams.n_merge > 0);
             cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
 
@@ -944,8 +942,7 @@ struct clip_graph {
         // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
         ggml_tensor * deepstack_features = nullptr;
-        const int merge_factor = hparams.proj_scale_factor > 0
-            ? hparams.proj_scale_factor : 4; // default 2x2=4 for qwen3vl
+        const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
 
         // loop over layers
         for (int il = 0; il < n_layer; il++) {
@@ -1168,7 +1165,7 @@ struct clip_graph {
 
         // pixel shuffle
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             const int height = n_patches_y;
             const int width = n_patches_x;
@@ -1258,7 +1255,7 @@ struct clip_graph {
         // based on Llama4VisionPixelShuffleMLP
         // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
         {
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             const int bsz = 1; // batch size, always 1 for now since we don't support batching
             GGML_ASSERT(scale_factor > 0);
             GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
@@ -1330,7 +1327,7 @@ struct clip_graph {
 
         {
             // patch_merger
-            const int scale_factor = model.hparams.proj_scale_factor;
+            const int scale_factor = model.hparams.n_merge;
             cur = build_patch_merge_permute(cur, scale_factor);
 
             // projection norm
@@ -2706,19 +2703,16 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_IDEFICS3:
                 {
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                 } break;
             case PROJECTOR_TYPE_LFM2:
                 {
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(64, 256);
                 } break;
@@ -2728,16 +2722,14 @@ struct clip_model_loader {
                     // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                     // TODO: verify the image_min_tokens
                     hparams.rope_theta = 10000.0f;
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_KIMIVL:
                 {
                     hparams.rope_theta = 10000.0f;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     // TODO: check kimivl preprocessor for exact values
                     hparams.set_limit_image_tokens(8, 1024);
                     hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
@@ -2746,17 +2738,16 @@ struct clip_model_loader {
                 {
                     // default value (used by all model sizes in gemma 3 family)
                     // number of patches for each **side** is reduced by a factor of 4
-                    hparams.proj_scale_factor = 4;
+                    hparams.n_merge = 4;
                    // test model (tinygemma3) has a different value, we optionally read it
-                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                 } break;
             case PROJECTOR_TYPE_QWEN2VL:
             case PROJECTOR_TYPE_QWEN25VL:
            case PROJECTOR_TYPE_QWEN3VL:
                 {
-                    int spatial_merge = 2;
-                    get_u32(KEY_SPATIAL_MERGE_SIZE, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    hparams.n_merge = 2; // default value for Qwen 2 and 2.5
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                     // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                     // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
@@ -2768,10 +2759,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_LLAMA4:
                 {
                     hparams.rope_theta = 10000.0f;
-                    // correct non-standard proj_scale_factor value
-                    int spatial_merge = 2;
-                    get_u32(KEY_PROJ_SCALE_FACTOR, spatial_merge, false);
-                    hparams.proj_scale_factor = spatial_merge * spatial_merge;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                     set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
@@ -2791,14 +2779,6 @@ struct clip_model_loader {
                 break;
         }
 
-        // sanity check
-        {
-            if (hparams.proj_scale_factor) {
-                const int n_merge = hparams.get_merge_kernel_size();
-                GGML_ASSERT(n_merge * n_merge == hparams.proj_scale_factor);
-            }
-        }
-
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -2812,11 +2792,8 @@ struct clip_model_loader {
         LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
-        LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+        LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
         LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
-        if (hparams.proj_scale_factor > 0) {
-            LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
-        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels);
         }
@@ -4048,7 +4025,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 clip_image_u8 canvas;
                 const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * params.n_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 canvas.nx = canvas_size.width;
@@ -4145,9 +4122,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             {
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 clip_image_u8 resized_image;
+                // the original pixtral model doesn't have n_merge
+                const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * cur_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
@@ -4178,7 +4157,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
                 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.get_merge_kernel_size(),
+                    params.patch_size * params.n_merge,
                     params.image_min_pixels,
                     params.image_max_pixels);
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
@@ -4366,15 +4345,14 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // both X and Y are downscaled by the scale factor
-                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                int scale_factor = ctx->model.hparams.n_merge;
                 n_patches /= (scale_factor * scale_factor);
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
             {
                 // dynamic size
-                int scale_factor = ctx->model.hparams.get_merge_kernel_size();
-                int out_patch_size = params.patch_size * scale_factor;
+                int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
                 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
                 n_patches = x_patch * y_patch;
@@ -4383,7 +4361,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_LIGHTONOCR:
             {
                 // dynamic size
-                int n_merge = params.get_merge_kernel_size();
+                int n_merge = ctx->model.hparams.n_merge;
                 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
                 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
                 if (ctx->model.token_embd_img_break) {

From bae84d4433a4bba97674d3982f8150e6e2b5474f Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 00:02:48 +0100
Subject: [PATCH 13/16] use simple resize for qwen

---
 tools/mtmd/clip.cpp | 71 +++++++++++++--------------------------------
 1 file changed, 20 insertions(+), 51 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 355b4243565ed..be28d1e62b349 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3426,24 +3426,18 @@ struct img_tool {
         // RESIZE_ALGO_LANCZOS, // TODO
     };
 
-    enum resize_pad {
-        RESIZE_PAD_NONE,
-        RESIZE_PAD_AROUND,
-        RESIZE_PAD_BOTTOM_RIGHT,
-    };
-
     static void resize(
             const clip_image_u8 & src,
             clip_image_u8 & dst,
             const clip_image_size & target_resolution,
             resize_algo algo,
-            resize_pad pad_mode = RESIZE_PAD_AROUND,
+            bool add_padding = true, // TODO: define the behavior for add_padding = false
             std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
         dst.nx = target_resolution.width;
         dst.ny = target_resolution.height;
         dst.buf.resize(3 * dst.nx * dst.ny);
 
-        if (pad_mode == RESIZE_PAD_NONE) {
+        if (!add_padding) {
             // direct resize
             switch (algo) {
                 case RESIZE_ALGO_BILINEAR:
@@ -3478,15 +3472,8 @@ struct img_tool {
             // fill dst with pad_color
             fill(dst, pad_color);
 
-            int offset_x = 0;
-            int offset_y = 0;
-            if (pad_mode == RESIZE_PAD_AROUND) {
-                offset_x = (target_resolution.width - new_width) / 2;
-                offset_y = (target_resolution.height - new_height) / 2;
-            } else if (pad_mode == RESIZE_PAD_BOTTOM_RIGHT) {
-                offset_x = target_resolution.width - new_width;
-                offset_y = target_resolution.height - new_height;
-            }
+            int offset_x = (target_resolution.width - new_width) / 2;
+            int offset_y = (target_resolution.height - new_height) / 2;
 
             composite(dst, resized_image, offset_x, offset_y);
         }
@@ -3523,8 +3510,9 @@ struct img_tool {
         float target_width_f = static_cast<float>(inp_size.width) * scale;
         float target_height_f = static_cast<float>(inp_size.height) * scale;
 
-        int aligned_width = CLIP_ALIGN((int)target_width_f, align_size);
-        int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);
+        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+        int aligned_width = ceil_by_factor(target_width_f);
+        int aligned_height = ceil_by_factor(target_height_f);
 
         return {aligned_width, aligned_height};
     }
@@ -3852,7 +3840,7 @@ struct llava_uhd {
         } else {
             // only algo bicubic preserves the ratio; old models rely on this behavior
             // TODO: do we need to support other algos here?
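// [editor's sketch, not part of the diff] How a caller drives the reworked
// resize() above after this patch; the target sizes below are illustrative,
// and the pad color is the gray used elsewhere in this file:
//
//     clip_image_u8 out;
//     // letterbox into 512x512: scale to fit, center, pad with gray
//     img_tool::resize(in, out, clip_image_size{512, 512},
//                      img_tool::RESIZE_ALGO_BILINEAR,
//                      /*add_padding=*/true, /*pad_color=*/{122, 116, 104});
//     // plain stretch to a grid-aligned size, no padding (the qwen path keeps
//     // the aspect ratio by picking the target via calc_size_preserved_ratio)
//     img_tool::resize(in, out, clip_image_size{448, 448},
//                      img_tool::RESIZE_ALGO_BICUBIC, /*add_padding=*/false);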
-            img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, img_tool::RESIZE_PAD_NONE);
+            img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
         }
 
         // create slices
@@ -4022,35 +4010,17 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         case PROJECTOR_TYPE_QWEN3VL:
             {
                 // step 1: make a blank canvas which aligns to the grid
-                clip_image_u8 canvas;
-                const clip_image_size canvas_size = img_tool::calc_size_preserved_ratio(
+                clip_image_u8 resized;
+                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
                     original_size,
-                    params.patch_size * params.n_merge,
+                    params.patch_size * 2,
                     params.image_min_pixels,
                     params.image_max_pixels);
-                canvas.nx = canvas_size.width;
-                canvas.ny = canvas_size.height;
-                canvas.buf.resize(3 * canvas.nx * canvas.ny);
-                img_tool::fill(canvas, {0, 0, 0});
-
-                // step 2: composite resized image onto the canvas, top-left corner
-                if (original_size.height > canvas.ny || original_size.width > canvas.nx) {
-                    // need to resize original image first
-                    clip_image_u8 resized;
-                    const clip_image_size scaled_size = img_tool::calc_size_preserved_ratio(
-                        original_size,
-                        1, // no need to align here since we will composite onto canvas
-                        std::min(canvas.nx, canvas.ny)); // fit into the canvas
-                    img_tool::resize(*img, resized, scaled_size, img_tool::RESIZE_ALGO_BILINEAR);
-                    img_tool::composite(canvas, resized, 0, 0);
-                } else {
-                    // no resizing needed
-                    img_tool::composite(canvas, *img, 0, 0);
-                }
-
+                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+                // clip_image_save_to_bmp(canvas, "preproc.bmp");
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(canvas, *img_f32, params.image_mean, params.image_std);
+                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
@@ -4163,7 +4133,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
 
                 clip_image_u8 resized_img;
-                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
                 clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
                 res_imgs->entries.push_back(std::move(res));
@@ -4195,7 +4165,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
 
                 // resize the image to the target_size
-                img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, img_tool::RESIZE_PAD_AROUND, pad_color);
+                img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
 
                 clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
@@ -4268,7 +4238,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
     const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
     if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+        return img->nx / (params.patch_size * 2);
     }
     return n_total;
 }
@@ -4276,7 +4246,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
 int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
     if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL) {
-        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+        return img->ny / (params.patch_size * 2);
     }
     return 1;
 }
@@ -4334,9 +4304,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN3VL:
             {
                 // dynamic size (2 conv, so double patch size)
-                int patch_size = params.patch_size * 2;
-                int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-                int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+                int x_patch = img->nx / (params.patch_size * 2);
+                int y_patch = img->ny / (params.patch_size * 2);
                 n_patches = x_patch * y_patch;
             } break;
         case PROJECTOR_TYPE_GEMMA3:

From 29c726885cdef823c8f476792977bdc5286d7bb8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 00:07:36 +0100
Subject: [PATCH 14/16] qwen: increase min tokens

---
 tools/mtmd/clip.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index be28d1e62b349..90fe006272503 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2753,8 +2753,11 @@ struct clip_model_loader {
                     // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
                     // but we set a lower value to avoid OOM
                     // TODO: make it configurable by user
-                    hparams.set_limit_image_tokens(1, 2048);
-                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                    // TODO (2): bbox coordinates become inaccurate with small number of tokens,
+                    // therefore we need to increase the min_tokens
+                    // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
+                    hparams.set_limit_image_tokens(256, 2048);
+                    hparams.set_warmup_n_tokens(1024); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {

From 00ee52e743f89c2a5ce88c2ebd2059bfb7185898 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 11:42:44 +0100
Subject: [PATCH 15/16] no resize if dst size == src size

---
 tools/mtmd/clip.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 90fe006272503..6707f0b1addb2 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -3440,6 +3440,12 @@ struct img_tool {
         dst.ny = target_resolution.height;
         dst.buf.resize(3 * dst.nx * dst.ny);
 
+        if (dst.nx == src.nx && dst.ny == src.ny) {
+            // no resize needed, simple copy
+            dst.buf = src.buf;
+            return;
+        }
+
         if (!add_padding) {
             // direct resize
             switch (algo) {
@@ -4020,7 +4026,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                     params.image_min_pixels,
                     params.image_max_pixels);
                 img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-                // clip_image_save_to_bmp(canvas, "preproc.bmp");
+                // clip_image_save_to_bmp(resized, "preproc.bmp");
                 clip_image_f32_ptr img_f32(clip_image_f32_init());
                 // clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);

From a834e1ce56f93cc8d3cb7920524ef6cfeb610981 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 1 Nov 2025 15:14:17 +0100
Subject: [PATCH 16/16] restore to initial min/max tokens value for qwen

---
 tools/mtmd/clip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 6707f0b1addb2..dcfdb49600b6c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2756,8 +2756,8 @@ struct clip_model_loader {
                     // TODO (2): bbox coordinates become inaccurate with small number of tokens,
                     // therefore we need to increase the min_tokens
                     // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-                    hparams.set_limit_image_tokens(256, 2048);
-                    hparams.set_warmup_n_tokens(1024); // avoid OOM on warmup
+                    hparams.set_limit_image_tokens(8, 2048);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                 } break;
             case PROJECTOR_TYPE_LLAMA4:
                 {
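
Reviewer's note, not part of the series: the net effect of patches 13-16 on the Qwen-VL path is a single aspect-preserving bilinear resize to a size aligned to patch_size * 2, after which the token counts are exact divisions. Below is a minimal self-contained sketch of that accounting; it assumes patch_size = 14 and a 2x2 spatial merge (the values in the Qwen preprocessor config cited above), ignores the min/max pixel clamp, and align_up is a hypothetical stand-in for CLIP_ALIGN.

#include <cstdio>

// ceil-align a dimension to the merge grid (hypothetical stand-in for CLIP_ALIGN)
static int align_up(int v, int a) { return ((v + a - 1) / a) * a; }

int main() {
    const int patch_size = 14;                    // ViT patch side
    const int n_merge    = 2;                     // 2x2 patches merge into one token
    const int grid       = patch_size * n_merge;  // alignment grid = 28 px

    // e.g. a 1280x720 input, aligned up to the grid by the preprocessor
    const int nx = align_up(1280, grid);          // 1288
    const int ny = align_up( 720, grid);          //  728

    // clip_n_output_tokens_x/y reduce to exact divisions after patch 13
    const int x_tok = nx / grid;                  // 46
    const int y_tok = ny / grid;                  // 26
    std::printf("%dx%d -> %d image tokens\n", nx, ny, x_tok * y_tok); // 1196
    return 0;
}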