Skip to content

Commit 262364e

Browse files
authored
mtmd: Implement tiling for LFM2-VL (ggml-org#19454)
1 parent 820ebfa commit 262364e

File tree

2 files changed

+147
-8
lines changed

2 files changed

+147
-8
lines changed

tools/mtmd/clip.cpp

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "ggml-backend.h"
1111
#include "gguf.h"
1212

13+
#include <algorithm>
1314
#include <cassert>
1415
#include <cmath>
1516
#include <cstdlib>
@@ -1116,9 +1117,8 @@ struct clip_model_loader {
11161117
case PROJECTOR_TYPE_LFM2:
11171118
{
11181119
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1119-
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
1120-
// config above specifies number of tokens after downsampling, while here it is before, relax lowerbound to 64
1121-
hparams.set_limit_image_tokens(64, 1024);
1120+
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
1121+
hparams.set_limit_image_tokens(64, 256);
11221122
} break;
11231123
case PROJECTOR_TYPE_PIXTRAL:
11241124
case PROJECTOR_TYPE_LIGHTONOCR:
@@ -2807,6 +2807,119 @@ struct llava_uhd {
28072807
}
28082808
};
28092809

2810+
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
2811+
// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
2812+
struct lfm2_vl_image_processor {
2813+
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
2814+
static constexpr int min_tiles = 2;
2815+
static constexpr int max_tiles = 10;
2816+
static constexpr float max_pixels_tolerance = 2.0f;
2817+
static constexpr int tile_size = 512;
2818+
2819+
static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
2820+
llava_uhd::slice_instructions inst;
2821+
const auto & params = ctx->model.hparams;
2822+
const int align_size = params.patch_size * params.n_merge;
2823+
2824+
inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
2825+
inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
2826+
inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);
2827+
2828+
// tile if either dimension exceeds tile_size with tolerance
2829+
const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
2830+
2831+
if (!needs_tiling) {
2832+
inst.refined_size = clip_image_size{0, 0};
2833+
inst.grid_size = clip_image_size{0, 0};
2834+
return inst;
2835+
}
2836+
2837+
const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
2838+
2839+
inst.grid_size = grid;
2840+
inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
2841+
2842+
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
2843+
__func__,
2844+
original_size.width, original_size.height,
2845+
inst.overview_size.width, inst.overview_size.height,
2846+
inst.refined_size.width, inst.refined_size.height,
2847+
grid.width, grid.height);
2848+
2849+
for (int row = 0; row < grid.height; row++) {
2850+
for (int col = 0; col < grid.width; col++) {
2851+
llava_uhd::slice_coordinates slice;
2852+
slice.x = col * tile_size;
2853+
slice.y = row * tile_size;
2854+
slice.size = clip_image_size{tile_size, tile_size};
2855+
inst.slices.push_back(slice);
2856+
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
2857+
__func__, (int)inst.slices.size() - 1,
2858+
slice.x, slice.y, slice.size.width, slice.size.height);
2859+
}
2860+
}
2861+
2862+
return inst;
2863+
}
2864+
2865+
private:
2866+
static clip_image_size find_closest_aspect_ratio(
2867+
float aspect_ratio,
2868+
const std::vector<clip_image_size> & target_ratios,
2869+
int width, int height) {
2870+
float best_ratio_diff = std::numeric_limits<float>::max();
2871+
clip_image_size best_ratio = {1, 1};
2872+
const float area = static_cast<float>(width * height);
2873+
2874+
for (const auto & ratio : target_ratios) {
2875+
const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
2876+
const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
2877+
if (ratio_diff < best_ratio_diff) {
2878+
best_ratio_diff = ratio_diff;
2879+
best_ratio = ratio;
2880+
} else if (ratio_diff == best_ratio_diff) {
2881+
const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
2882+
if (area > 0.5f * target_area) {
2883+
best_ratio = ratio;
2884+
}
2885+
}
2886+
}
2887+
return best_ratio;
2888+
}
2889+
2890+
static std::vector<clip_image_size> get_target_ratios() {
2891+
std::vector<clip_image_size> ratios;
2892+
for (int n = min_tiles; n <= max_tiles; n++) {
2893+
for (int w = 1; w <= n; w++) {
2894+
for (int h = 1; h <= n; h++) {
2895+
if (w * h >= min_tiles && w * h <= max_tiles) {
2896+
bool found = false;
2897+
for (const auto & r : ratios) {
2898+
if (r.width == w && r.height == h) {
2899+
found = true;
2900+
break;
2901+
}
2902+
}
2903+
if (!found) {
2904+
ratios.push_back({w, h});
2905+
}
2906+
}
2907+
}
2908+
}
2909+
}
2910+
std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
2911+
return a.width * a.height < b.width * b.height;
2912+
});
2913+
return ratios;
2914+
}
2915+
2916+
static clip_image_size get_grid_layout(int height, int width) {
2917+
const float aspect_ratio = static_cast<float>(width) / height;
2918+
const auto ratios = get_target_ratios();
2919+
return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
2920+
}
2921+
};
2922+
28102923
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
28112924
// res_imgs memory is being allocated here, previous allocations will be freed if found
28122925
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -3021,6 +3134,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30213134
} break;
30223135

30233136
case PROJECTOR_TYPE_LFM2:
3137+
{
3138+
auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
3139+
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3140+
3141+
for (size_t i = 0; i < imgs.size(); ++i) {
3142+
clip_image_f32_ptr res(clip_image_f32_init());
3143+
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3144+
res_imgs->entries.push_back(std::move(res));
3145+
}
3146+
3147+
res_imgs->grid_x = inst.grid_size.width;
3148+
res_imgs->grid_y = inst.grid_size.height;
3149+
} break;
3150+
30243151
case PROJECTOR_TYPE_KIMIVL:
30253152
{
30263153
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
@@ -3032,8 +3159,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30323159
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
30333160

30343161
clip_image_u8 resized_img;
3035-
const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2);
3036-
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color);
3162+
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
30373163
clip_image_f32_ptr res(clip_image_f32_init());
30383164
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
30393165
res_imgs->entries.push_back(std::move(res));

tools/mtmd/mtmd.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ enum mtmd_slice_tmpl {
8585
MTMD_SLICE_TMPL_MINICPMV_2_6,
8686
MTMD_SLICE_TMPL_LLAMA4,
8787
MTMD_SLICE_TMPL_IDEFICS3,
88+
MTMD_SLICE_TMPL_LFM2,
8889
};
8990

9091
const char * mtmd_default_marker() {
@@ -307,9 +308,19 @@ struct mtmd_context {
307308
img_end = "<|im_end|>";
308309

309310
} else if (proj == PROJECTOR_TYPE_LFM2) {
310-
img_beg = "<|image_start|>";
311-
img_end = "<|image_end|>";
312-
311+
// multi-tile:
312+
// <|image_start|>
313+
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
314+
// <|img_thumbnail|> (thumbnail)
315+
// <|image_end|>
316+
// single-tile:
317+
// <|image_start|> (image) <|image_end|>
318+
img_beg = "<|image_start|>";
319+
img_end = "<|image_end|>";
320+
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
321+
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
322+
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
323+
ov_img_first = false;
313324
} else if (proj == PROJECTOR_TYPE_GLM4V) {
314325
img_beg = "<|begin_of_image|>";
315326
img_end = "<|end_of_image|>";
@@ -562,11 +573,13 @@ struct mtmd_tokenizer {
562573
}
563574

564575
// handle llava-uhd style preprocessing
576+
const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0;
565577
if (
566578
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
567579
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
568580
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
569581
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
582+
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
570583
) {
571584
const int n_col = batch_f32.grid_x;
572585
const int n_row = batch_f32.grid_y;

0 commit comments

Comments
 (0)