Skip to content

Commit 3645fe0

Browse files
committed
rm load_image_size
1 parent 4217d42 commit 3645fe0

File tree

4 files changed

+26
-38
lines changed

4 files changed

+26
-38
lines changed

tools/mtmd/clip-impl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,11 @@ struct clip_image_u8_batch {
243243
struct clip_image_f32_batch {
244244
std::vector<clip_image_f32_ptr> entries;
245245

246+
// for llava-uhd style models, we need to know the grid size
247+
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
248+
int grid_x = 0;
249+
int grid_y = 0;
250+
246251
clip_image_f32_batch clone() const {
247252
clip_image_f32_batch new_batch;
248253
new_batch.entries.reserve(entries.size());

tools/mtmd/clip.cpp

Lines changed: 9 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -359,8 +359,6 @@ struct clip_ctx {
359359
int max_nodes = 8192;
360360
ggml_backend_sched_ptr sched;
361361

362-
clip_image_size load_image_size;
363-
364362
// for debugging
365363
bool debug_graph = false;
366364
std::vector<ggml_tensor *> debug_print_tensors;
@@ -2457,14 +2455,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
24572455
return ctx_clip;
24582456
}
24592457

2460-
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
2461-
ctx_clip->load_image_size = *load_image_size; // copy
2462-
}
2463-
2464-
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
2465-
return &ctx_clip->load_image_size;
2466-
}
2467-
24682458
struct clip_image_size * clip_image_size_init() {
24692459
struct clip_image_size * load_image_size = new struct clip_image_size();
24702460
load_image_size->width = 448;
@@ -3045,12 +3035,6 @@ struct llava_uhd {
30453035
}
30463036
};
30473037

3048-
// TODO @ngxson : deprecate the load_image_size singleton pattern
3049-
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
3050-
const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
3051-
return inst.grid_size.width;
3052-
}
3053-
30543038
// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6), it returns the normalized image patch tensors as a vector
30553039
// res_imgs memory is allocated here; previous allocations will be freed if found
30563040
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -3072,9 +3056,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
30723056
normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
30733057
res_imgs->entries.push_back(std::move(res));
30743058
}
3059+
3060+
res_imgs->grid_x = inst.grid_size.width;
3061+
res_imgs->grid_y = inst.grid_size.height;
30753062
return true;
3076-
}
3077-
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3063+
3064+
} else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
30783065
clip_image_u8 resized;
30793066
auto patch_size = params.patch_size * 2;
30803067
auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -3122,6 +3109,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
31223109
res_imgs->entries.push_back(std::move(res));
31233110
}
31243111

3112+
res_imgs->grid_x = inst.grid_size.width;
3113+
res_imgs->grid_y = inst.grid_size.height;
31253114
return true;
31263115

31273116
}
@@ -3409,8 +3398,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
34093398
const int patch_size = hparams.patch_size;
34103399
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
34113400
const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
3412-
const int pos_w = ctx->load_image_size.width / patch_size;
3413-
const int pos_h = ctx->load_image_size.height / patch_size;
3401+
const int pos_w = image_size_width / patch_size;
3402+
const int pos_h = image_size_height / patch_size;
34143403

34153404
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
34163405

tools/mtmd/clip.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
4747
// this should be equal to the embedding dimension of the text model
4848
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
4949

50-
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
51-
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
52-
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
53-
5450
struct clip_image_size * clip_image_size_init(void);
5551
struct clip_image_u8 * clip_image_u8_init (void);
5652
struct clip_image_f32 * clip_image_f32_init(void);

tools/mtmd/mtmd.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ enum mtmd_slice_tmpl {
4242
MTMD_SLICE_TMPL_NONE,
4343
MTMD_SLICE_TMPL_MINICPMV_2_5,
4444
MTMD_SLICE_TMPL_MINICPMV_2_6,
45+
MTMD_SLICE_TMPL_LLAMA4,
4546
// TODO @ngxson : add support for idefics (SmolVLM)
4647
};
4748

@@ -65,6 +66,7 @@ struct mtmd_context {
6566
std::string image_marker;
6667

6768
// for llava-uhd style models, we need special tokens in-between slices
69+
// minicpmv calls them "slices", llama 4 calls them "tiles"
6870
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
6971
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
7072
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
@@ -137,6 +139,7 @@ struct mtmd_context {
137139
// ... <|tile_y_separator|> <-- trailing end-of-row token
138140
// <|image|> (overview) <-- overview image is last
139141
// <|image_end|>
142+
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
140143
tok_ov_img_start = lookup_token("<|image|>");
141144
tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
142145
tok_row_end = lookup_token("<|tile_y_separator|>");
@@ -361,7 +364,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
361364
return 2;
362365
}
363366

364-
if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
367+
// handle llava-uhd style preprocessing
368+
if (
369+
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
370+
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
371+
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
372+
) {
365373
// split batch into chunks of single images
366374
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
367375
GGML_ASSERT(chunks.size() > 0);
@@ -380,12 +388,10 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
380388
}
381389
}
382390

383-
// add slices
391+
// add slices (or tiles)
384392
if (!chunks.empty()) {
385-
clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
386-
int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
387-
int n_row = (int)chunks.size() / n_col;
388-
GGML_ASSERT(n_row * n_col == (int)chunks.size());
393+
const int n_col = batch_f32.grid_x;
394+
const int n_row = batch_f32.grid_y;
389395
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
390396
add_text_chunk({ctx->tok_slices_start});
391397
}
@@ -473,14 +479,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
473479
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
474480
bool ok = false;
475481

476-
// only effective for minicpmv and qwen2vl, other models will ignore load_image_size
477-
{
478-
clip_image_size slice_size{
479-
image_tokens->batch_f32.entries[0]->nx,
480-
image_tokens->batch_f32.entries[0]->ny};
481-
clip_add_load_image_size(ctx->ctx_clip, &slice_size);
482-
}
483-
484482
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
485483
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
486484
const auto & entries = image_tokens->batch_f32.entries;

0 commit comments

Comments
 (0)