@@ -42,6 +42,7 @@ enum mtmd_slice_tmpl {
4242 MTMD_SLICE_TMPL_NONE,
4343 MTMD_SLICE_TMPL_MINICPMV_2_5,
4444 MTMD_SLICE_TMPL_MINICPMV_2_6,
45+ MTMD_SLICE_TMPL_LLAMA4,
4546 // TODO @ngxson : add support for idefics (SmolVLM)
4647};
4748
@@ -65,6 +66,7 @@ struct mtmd_context {
6566 std::string image_marker;
6667
6768 // for llava-uhd style models, we need special tokens in-between slices
69+ // minicpmv calls them "slices", llama 4 calls them "tiles"
6870 mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
6971 llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
7072 llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
@@ -137,6 +139,7 @@ struct mtmd_context {
137139 // ... <|tile_y_separator|> <-- trailing end-of-row token
138140 // <|image|> (overview) <-- overview image is last
139141 // <|image_end|>
142+ slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
140143 tok_ov_img_start = lookup_token (" <|image|>" );
141144 tok_sli_img_mid = lookup_token (" <|tile_x_separator|>" );
142145 tok_row_end = lookup_token (" <|tile_y_separator|>" );
@@ -361,7 +364,12 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
361364 return 2 ;
362365 }
363366
364- if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
367+ // handle llava-uhd style preprocessing
368+ if (
369+ ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
370+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
371+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
372+ ) {
365373 // split batch into chunks of single images
366374 auto chunks = split_batch_to_chunk (std::move (batch_f32), bitmaps[i_img]->id );
367375 GGML_ASSERT (chunks.size () > 0 );
@@ -380,12 +388,10 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
380388 }
381389 }
382390
383- // add slices
391+ // add slices (or tiles)
384392 if (!chunks.empty ()) {
385- clip_add_load_image_size (ctx->ctx_clip , &img_u8_size);
386- int n_col = clip_uhd_num_image_embeds_col (ctx->ctx_clip );
387- int n_row = (int )chunks.size () / n_col;
388- GGML_ASSERT (n_row * n_col == (int )chunks.size ());
393+ const int n_col = batch_f32.grid_x ;
394+ const int n_row = batch_f32.grid_y ;
389395 if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
390396 add_text_chunk ({ctx->tok_slices_start });
391397 }
@@ -473,14 +479,6 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
473479 ctx->image_embd_v .resize (image_tokens->n_tokens () * n_mmproj_embd);
474480 bool ok = false ;
475481
476- // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
477- {
478- clip_image_size slice_size{
479- image_tokens->batch_f32 .entries [0 ]->nx ,
480- image_tokens->batch_f32 .entries [0 ]->ny };
481- clip_add_load_image_size (ctx->ctx_clip , &slice_size);
482- }
483-
484482 if (clip_is_llava (ctx->ctx_clip ) || clip_is_minicpmv (ctx->ctx_clip ) || clip_is_glm (ctx->ctx_clip )) {
485483 // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
486484 const auto & entries = image_tokens->batch_f32 .entries ;
0 commit comments