 #include <limits>
 #include <vector>

+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models that don't use it (e.g. llava-1.6) will process embeddings without any special tokens in-between
+enum mtmd_slice_tmpl {
+    MTMD_SLICE_TMPL_NONE,
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    // TODO @ngxson : add support for idefics (SmolVLM)
+};
+
 struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
@@ -21,6 +30,16 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;

+    // for minicpmv, we need special tokens in-between slices
+    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
+    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+
     // TODO @ngxson : add timings

     mtmd_context(const char * mmproj_fname,
@@ -38,11 +57,64 @@ struct mtmd_context {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
         this->text_model = text_model;
+
+        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_slices_start  = lookup_token("<slice>");
+            tok_slices_end    = lookup_token("</slice>");
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version != 0) {
+            GGML_ASSERT(false && "unsupported minicpmv version");
+        }
     }

     ~mtmd_context() {
         clip_free(ctx_clip);
     }
+
+private:
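+    // find the token id whose detokenized piece matches token_text exactly (linear scan over the vocab)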
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
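+    // detokenize a single token; a negative return value from llama_token_to_piece() means the buffer was too small and -n is the required size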
+    std::string token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\0'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
 };

 struct mtmd_image_tokens_data {
@@ -122,6 +194,38 @@ int32_t mtmd_tokenize(mtmd_context * ctx,

     size_t i_img = 0;

+    // utility for adding a chunk of raw text tokens to the output
+    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        };
+        output.emplace_back(std::move(chunk));
+    };
+
+    // utility for splitting a batch of multiple images into chunks, each carrying a single image
+    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
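+            // wrap each preprocessed f32 entry (one slice) into its own single-image chunk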
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_patches(ctx->ctx_clip);
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {},
+                std::move(image_tokens),
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    };
+
     for (const auto & part : parts) {
         // printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
@@ -144,12 +248,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 1;
             }

-            // shim layer
+            // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
             img_u8->nx = bitmaps[i_img].nx;
             img_u8->ny = bitmaps[i_img].ny;
             img_u8->buf.resize(bitmaps[i_img].data.size());
             std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
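+            // keep the original image size around; the slice-template path below needs it to work out the slice grid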
+            clip_image_size img_u8_size{img_u8->nx, img_u8->ny};

             // preprocess image
             clip_image_f32_batch batch_f32;
@@ -159,28 +264,70 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 2;
             }

-            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
-            image_tokens->ny = 1; // TODO
-            image_tokens->batch_f32 = std::move(batch_f32);
-            image_tokens->id = bitmaps[i_img].id; // optional
-
-            LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
-            LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
-            LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
-
-            if (clip_is_glm(ctx->ctx_clip)) {
-                // glm-edge
-                image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
+            if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) {
+                // split batch into chunks of single images
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id);
+                GGML_ASSERT(chunks.size() > 0);
+
+                // add overview image
+                add_text_chunk({ctx->tok_ov_img_start});
+                output.emplace_back(std::move(chunks.front()));
+                chunks.erase(chunks.begin());
+                add_text_chunk({ctx->tok_ov_img_end});
+
+                // add slices
+                if (!chunks.empty()) {
+                    clip_add_load_image_size(ctx->ctx_clip, &img_u8_size);
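+                    // slices are laid out as a grid; the clip context reports how many slice columns this image was split into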
+                    int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip);
+                    int n_row = (int)chunks.size() / n_col;
+                    GGML_ASSERT(n_row * n_col == (int)chunks.size());
+                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_start});
+                    }
+                    for (int y = 0; y < n_row; y++) {
+                        for (int x = 0; x < n_col; x++) {
+                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_start});
+                            }
+                            output.emplace_back(std::move(chunks[y * n_col + x]));
+                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_end});
+                            }
+                        }
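+                        // end each row of slices with tok_row_end, except after the last row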
+                        if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) {
+                            add_text_chunk({ctx->tok_row_end});
+                        }
+                    }
+                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
+                        add_text_chunk({ctx->tok_slices_end});
+                    }
+                }
+
+            } else {
+                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+                image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image
+                image_tokens->ny = 1; // TODO
+                image_tokens->batch_f32 = std::move(batch_f32);
+                image_tokens->id = bitmaps[i_img].id; // optional
+
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
+                if (clip_is_glm(ctx->ctx_clip)) {
+                    // glm-edge
+                    image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
+                }
+
+                mtmd_input_chunk chunk{
+                    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                    {},
+                    std::move(image_tokens),
+                };
+                output.emplace_back(std::move(chunk));
             }

-            mtmd_input_chunk chunk{
-                MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                {},
-                std::move(image_tokens),
-            };
-            output.emplace_back(std::move(chunk));
-            i_img++;
+            i_img++; // move to next image
         }
     }

@@ -214,7 +361,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;

-    if (clip_is_llava(ctx->ctx_clip)) {
+    // only effective for minicpmv and qwen2vl, other models will ignore load_image_size
+    {
+        clip_image_size slice_size{
+            image_tokens->batch_f32.entries[0]->nx,
+            image_tokens->batch_f32.entries[0]->ny};
+        clip_add_load_image_size(ctx->ctx_clip, &slice_size);
+    }
+
+    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
         // TODO @ngxson : llava does not support batched encoding; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {
@@ -330,7 +485,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             GGML_ASSERT(chunk.tokens_image != nullptr);
             int64_t t0 = ggml_time_ms();
             if (ctx->print_timings) {
-                LOG_INF("encoding image...\n");
+                LOG_INF("encoding image or slice...\n");
             }
             ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
@@ -339,7 +494,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }

             int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());