@@ -114,14 +114,14 @@ struct mtmd_context {
114
114
// for llava-uhd style models, we need special tokens in-between slices
115
115
// minicpmv calls them "slices", llama 4 calls them "tiles"
116
116
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
117
- llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
118
- llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
119
- llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
120
- llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
121
- llama_token tok_sli_img_start = LLAMA_TOKEN_NULL ; // single slice start
122
- llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
123
- llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
124
- llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
117
+ std::vector< llama_token> tok_ov_img_start; // overview image
118
+ std::vector< llama_token> tok_ov_img_end; // overview image
119
+ std::vector< llama_token> tok_slices_start; // start of all slices
120
+ std::vector< llama_token> tok_slices_end; // end of all slices
121
+ std::vector< llama_token> tok_sli_img_start; // single slice start
122
+ std::vector< llama_token> tok_sli_img_end; // single slice end
123
+ std::vector< llama_token> tok_sli_img_mid; // between 2 slices
124
+ std::vector< llama_token> tok_row_end; // end of row
125
125
bool tok_row_end_trail = false ;
126
126
bool ov_img_first = false ;
127
127
@@ -197,25 +197,25 @@ struct mtmd_context {
197
197
// minicpmv 2.5 format:
198
198
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
199
199
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
200
- tok_ov_img_start = lookup_token (" <image>" );
201
- tok_ov_img_end = lookup_token (" </image>" );
202
- tok_slices_start = lookup_token (" <slice>" );
203
- tok_slices_end = lookup_token (" </slice>" );
200
+ tok_ov_img_start = { lookup_token (" <image>" )} ;
201
+ tok_ov_img_end = { lookup_token (" </image>" )} ;
202
+ tok_slices_start = { lookup_token (" <slice>" )} ;
203
+ tok_slices_end = { lookup_token (" </slice>" )} ;
204
204
tok_sli_img_start = tok_ov_img_start;
205
205
tok_sli_img_end = tok_ov_img_end;
206
- tok_row_end = lookup_token (" \n " );
206
+ tok_row_end = { lookup_token (" \n " )} ;
207
207
tok_row_end_trail = false ; // no trailing end-of-row token
208
208
ov_img_first = true ;
209
209
210
210
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 ) {
211
211
// minicpmv 2.6 format:
212
212
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
213
213
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
214
- tok_ov_img_start = lookup_token (" <image>" );
215
- tok_ov_img_end = lookup_token (" </image>" );
216
- tok_sli_img_start = lookup_token (" <slice>" );
217
- tok_sli_img_end = lookup_token (" </slice>" );
218
- tok_row_end = lookup_token (" \n " );
214
+ tok_ov_img_start = { lookup_token (" <image>" )} ;
215
+ tok_ov_img_end = { lookup_token (" </image>" )} ;
216
+ tok_sli_img_start = { lookup_token (" <slice>" )} ;
217
+ tok_sli_img_end = { lookup_token (" </slice>" )} ;
218
+ tok_row_end = { lookup_token (" \n " )} ;
219
219
tok_row_end_trail = false ; // no trailing end-of-row token
220
220
ov_img_first = true ;
221
221
@@ -230,9 +230,9 @@ struct mtmd_context {
230
230
// <|image|> (overview) <-- overview image is last
231
231
// <|image_end|>
232
232
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
233
- tok_ov_img_start = lookup_token (" <|image|>" );
234
- tok_sli_img_mid = lookup_token (" <|tile_x_separator|>" );
235
- tok_row_end = lookup_token (" <|tile_y_separator|>" );
233
+ tok_ov_img_start = { lookup_token (" <|image|>" )} ;
234
+ tok_sli_img_mid = { lookup_token (" <|tile_x_separator|>" )} ;
235
+ tok_row_end = { lookup_token (" <|tile_y_separator|>" )} ;
236
236
tok_row_end_trail = true ; // add trailing end-of-row token
237
237
ov_img_first = false ; // overview image is last
238
238
}
@@ -517,52 +517,52 @@ struct mtmd_tokenizer {
517
517
518
518
// add overview image (first)
519
519
if (ctx->ov_img_first ) {
520
- if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL ) {
521
- add_text ({ ctx->tok_ov_img_start } );
520
+ if (! ctx->tok_ov_img_start . empty () ) {
521
+ add_text (ctx->tok_ov_img_start );
522
522
}
523
523
cur.entries .emplace_back (std::move (ov_chunk));
524
- if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL ) {
525
- add_text ({ ctx->tok_ov_img_end } );
524
+ if (! ctx->tok_ov_img_end . empty () ) {
525
+ add_text (ctx->tok_ov_img_end );
526
526
}
527
527
}
528
528
529
529
// add slices (or tiles)
530
530
if (!chunks.empty ()) {
531
531
GGML_ASSERT ((int )chunks.size () == n_row * n_col);
532
- if (ctx->tok_slices_start != LLAMA_TOKEN_NULL ) {
533
- add_text ({ ctx->tok_slices_start } );
532
+ if (! ctx->tok_slices_start . empty () ) {
533
+ add_text (ctx->tok_slices_start );
534
534
}
535
535
for (int y = 0 ; y < n_row; y++) {
536
536
for (int x = 0 ; x < n_col; x++) {
537
537
const bool is_last_in_row = (x == n_col - 1 );
538
- if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL ) {
539
- add_text ({ ctx->tok_sli_img_start } );
538
+ if (! ctx->tok_sli_img_start . empty () ) {
539
+ add_text (ctx->tok_sli_img_start );
540
540
}
541
541
cur.entries .emplace_back (std::move (chunks[y * n_col + x]));
542
- if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL ) {
543
- add_text ({ ctx->tok_sli_img_end } );
542
+ if (! ctx->tok_sli_img_end . empty () ) {
543
+ add_text (ctx->tok_sli_img_end );
544
544
}
545
- if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL ) {
546
- add_text ({ ctx->tok_sli_img_mid } );
545
+ if (!is_last_in_row && ! ctx->tok_sli_img_mid . empty () ) {
546
+ add_text (ctx->tok_sli_img_mid );
547
547
}
548
548
}
549
- if ((y != n_row - 1 || ctx->tok_row_end_trail ) && ctx->tok_row_end != LLAMA_TOKEN_NULL ) {
550
- add_text ({ ctx->tok_row_end } );
549
+ if ((y != n_row - 1 || ctx->tok_row_end_trail ) && ! ctx->tok_row_end . empty () ) {
550
+ add_text (ctx->tok_row_end );
551
551
}
552
552
}
553
- if (ctx->tok_slices_end != LLAMA_TOKEN_NULL ) {
554
- add_text ({ ctx->tok_slices_end } );
553
+ if (! ctx->tok_slices_end . empty () ) {
554
+ add_text (ctx->tok_slices_end );
555
555
}
556
556
}
557
557
558
558
// add overview image (last)
559
559
if (!ctx->ov_img_first ) {
560
- if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL ) {
561
- add_text ({ ctx->tok_ov_img_start } );
560
+ if (! ctx->tok_ov_img_start . empty () ) {
561
+ add_text (ctx->tok_ov_img_start );
562
562
}
563
563
cur.entries .emplace_back (std::move (ov_chunk));
564
- if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL ) {
565
- add_text ({ ctx->tok_ov_img_end } );
564
+ if (! ctx->tok_ov_img_end . empty () ) {
565
+ add_text (ctx->tok_ov_img_end );
566
566
}
567
567
}
568
568
0 commit comments