Skip to content

Commit 0aef5e9

Browse files
committed
feat: Allow multi-token boundary sequences for image templating
Branch: gabe-l-hart/GraniteDocling
Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 4ef3128 commit 0aef5e9

File tree

1 file changed

+41
-41
lines changed

1 file changed

+41
-41
lines changed

tools/mtmd/mtmd.cpp

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,14 @@ struct mtmd_context {
114114
// for llava-uhd style models, we need special tokens in-between slices
115115
// minicpmv calls them "slices", llama 4 calls them "tiles"
116116
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
117-
llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
118-
llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
119-
llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
120-
llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
121-
llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
122-
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
123-
llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
124-
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
117+
std::vector<llama_token> tok_ov_img_start; // overview image
118+
std::vector<llama_token> tok_ov_img_end; // overview image
119+
std::vector<llama_token> tok_slices_start; // start of all slices
120+
std::vector<llama_token> tok_slices_end; // end of all slices
121+
std::vector<llama_token> tok_sli_img_start; // single slice start
122+
std::vector<llama_token> tok_sli_img_end; // single slice end
123+
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
124+
std::vector<llama_token> tok_row_end; // end of row
125125
bool tok_row_end_trail = false;
126126
bool ov_img_first = false;
127127

@@ -197,25 +197,25 @@ struct mtmd_context {
197197
// minicpmv 2.5 format:
198198
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
199199
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
200-
tok_ov_img_start = lookup_token("<image>");
201-
tok_ov_img_end = lookup_token("</image>");
202-
tok_slices_start = lookup_token("<slice>");
203-
tok_slices_end = lookup_token("</slice>");
200+
tok_ov_img_start = {lookup_token("<image>")};
201+
tok_ov_img_end = {lookup_token("</image>")};
202+
tok_slices_start = {lookup_token("<slice>")};
203+
tok_slices_end = {lookup_token("</slice>")};
204204
tok_sli_img_start = tok_ov_img_start;
205205
tok_sli_img_end = tok_ov_img_end;
206-
tok_row_end = lookup_token("\n");
206+
tok_row_end = {lookup_token("\n")};
207207
tok_row_end_trail = false; // no trailing end-of-row token
208208
ov_img_first = true;
209209

210210
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
211211
// minicpmv 2.6 format:
212212
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
213213
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
214-
tok_ov_img_start = lookup_token("<image>");
215-
tok_ov_img_end = lookup_token("</image>");
216-
tok_sli_img_start = lookup_token("<slice>");
217-
tok_sli_img_end = lookup_token("</slice>");
218-
tok_row_end = lookup_token("\n");
214+
tok_ov_img_start = {lookup_token("<image>")};
215+
tok_ov_img_end = {lookup_token("</image>")};
216+
tok_sli_img_start = {lookup_token("<slice>")};
217+
tok_sli_img_end = {lookup_token("</slice>")};
218+
tok_row_end = {lookup_token("\n")};
219219
tok_row_end_trail = false; // no trailing end-of-row token
220220
ov_img_first = true;
221221

@@ -230,9 +230,9 @@ struct mtmd_context {
230230
// <|image|> (overview) <-- overview image is last
231231
// <|image_end|>
232232
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
233-
tok_ov_img_start = lookup_token("<|image|>");
234-
tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
235-
tok_row_end = lookup_token("<|tile_y_separator|>");
233+
tok_ov_img_start = {lookup_token("<|image|>")};
234+
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
235+
tok_row_end = {lookup_token("<|tile_y_separator|>")};
236236
tok_row_end_trail = true; // add trailing end-of-row token
237237
ov_img_first = false; // overview image is last
238238
}
@@ -517,52 +517,52 @@ struct mtmd_tokenizer {
517517

518518
// add overview image (first)
519519
if (ctx->ov_img_first) {
520-
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
521-
add_text({ctx->tok_ov_img_start});
520+
if (!ctx->tok_ov_img_start.empty()) {
521+
add_text(ctx->tok_ov_img_start);
522522
}
523523
cur.entries.emplace_back(std::move(ov_chunk));
524-
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
525-
add_text({ctx->tok_ov_img_end});
524+
if (!ctx->tok_ov_img_end.empty()) {
525+
add_text(ctx->tok_ov_img_end);
526526
}
527527
}
528528

529529
// add slices (or tiles)
530530
if (!chunks.empty()) {
531531
GGML_ASSERT((int)chunks.size() == n_row * n_col);
532-
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
533-
add_text({ctx->tok_slices_start});
532+
if (!ctx->tok_slices_start.empty()) {
533+
add_text(ctx->tok_slices_start);
534534
}
535535
for (int y = 0; y < n_row; y++) {
536536
for (int x = 0; x < n_col; x++) {
537537
const bool is_last_in_row = (x == n_col - 1);
538-
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
539-
add_text({ctx->tok_sli_img_start});
538+
if (!ctx->tok_sli_img_start.empty()) {
539+
add_text(ctx->tok_sli_img_start);
540540
}
541541
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
542-
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
543-
add_text({ctx->tok_sli_img_end});
542+
if (!ctx->tok_sli_img_end.empty()) {
543+
add_text(ctx->tok_sli_img_end);
544544
}
545-
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
546-
add_text({ctx->tok_sli_img_mid});
545+
if (!is_last_in_row && !ctx->tok_sli_img_mid.empty()) {
546+
add_text(ctx->tok_sli_img_mid);
547547
}
548548
}
549-
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
550-
add_text({ctx->tok_row_end});
549+
if ((y != n_row - 1 || ctx->tok_row_end_trail) && !ctx->tok_row_end.empty()) {
550+
add_text(ctx->tok_row_end);
551551
}
552552
}
553-
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
554-
add_text({ctx->tok_slices_end});
553+
if (!ctx->tok_slices_end.empty()) {
554+
add_text(ctx->tok_slices_end);
555555
}
556556
}
557557

558558
// add overview image (last)
559559
if (!ctx->ov_img_first) {
560-
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
561-
add_text({ctx->tok_ov_img_start});
560+
if (!ctx->tok_ov_img_start.empty()) {
561+
add_text(ctx->tok_ov_img_start);
562562
}
563563
cur.entries.emplace_back(std::move(ov_chunk));
564-
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
565-
add_text({ctx->tok_ov_img_end});
564+
if (!ctx->tok_ov_img_end.empty()) {
565+
add_text(ctx->tok_ov_img_end);
566566
}
567567
}
568568

0 commit comments

Comments
 (0)