@@ -402,10 +402,33 @@ struct mtmd_tokenizer {
402402 }
403403 } else {
404404 // this is a text part, we should add it as text
405- add_text (part, add_special, parse_special);
405+ add_text (part, parse_special);
406406 }
407407 }
408408
409+ if (add_special && llama_vocab_get_add_bos (vocab)) {
410+ // if first chunk is text, we add BOS token to first text chunk
411+ // otherwise, create a new text chunk with BOS token
412+ if (!cur.entries .empty () && cur.entries [0 ].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
413+ // add BOS token to the beginning of first text chunk
414+ cur.entries [0 ].tokens_text .insert (cur.entries [0 ].tokens_text .begin (), llama_vocab_bos (vocab));
415+ } else {
416+ // create a new text chunk with BOS token at the beginning
417+ mtmd_input_chunk bos_chunk{
418+ MTMD_INPUT_CHUNK_TYPE_TEXT,
419+ {llama_vocab_bos (vocab)},
420+ nullptr , // image tokens
421+ nullptr , // audio tokens
422+ };
423+ cur.entries .insert (cur.entries .begin (), std::move (bos_chunk));
424+ }
425+ }
426+
427+ if (add_special && llama_vocab_get_add_eos (vocab)) {
428+ // if last chunk is text, we add EOS token to it
429+ add_text ({llama_vocab_eos (vocab)});
430+ }
431+
409432 if (i_bm != bitmaps.size ()) {
410433 LOG_ERR (" %s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n " ,
411434 __func__, bitmaps.size (), parts.size () - 1 );
@@ -417,9 +440,9 @@ struct mtmd_tokenizer {
417440 return 0 ;
418441 }
419442
// add_text (string overload): logs the incoming text with LOG_DBG, tokenizes it
// with mtmd_tokenize_text_internal, and forwards the resulting tokens to the
// add_text (tokens) overload.
//
// What this hunk changes:
//   - the `add_special` parameter is removed from the signature; only `txt` and
//     `parse_special` remain.
//   - the call to mtmd_tokenize_text_internal now always passes `false` for
//     add_special, so this overload no longer forwards any request for special
//     tokens to the tokenizer.
//   - in the same commit, the calling code that tokenizes the parts inserts the
//     BOS token (prepended to the first text chunk, or as a new leading text
//     chunk) and appends the EOS token itself, gated on `add_special` and on
//     llama_vocab_get_add_bos / llama_vocab_get_add_eos.
//
// `parse_special` is passed through unchanged to mtmd_tokenize_text_internal.
420- void add_text (const std::string & txt, bool add_special, bool parse_special) {
443+ void add_text (const std::string & txt, bool parse_special) {
421444 LOG_DBG (" %s: %s\n " , __func__, txt.c_str ());
422- auto tokens = mtmd_tokenize_text_internal (vocab, txt, add_special, parse_special);
445+ auto tokens = mtmd_tokenize_text_internal (vocab, txt, /* add_special */ false , parse_special);
423446 add_text (tokens);
424447 }
425448
@@ -454,7 +477,7 @@ struct mtmd_tokenizer {
454477 }
455478
456479 if (!ctx->img_beg .empty ()) {
457- add_text (ctx->img_beg , false , true ); // add image begin token
480+ add_text (ctx->img_beg , true ); // add image begin token
458481 }
459482
460483 // convert mtmd_bitmap to clip_image_u8
@@ -571,7 +594,7 @@ struct mtmd_tokenizer {
571594 }
572595
573596 if (!ctx->img_end .empty ()) {
574- add_text (ctx->img_end , false , true ); // add image end token
597+ add_text (ctx->img_end , true ); // add image end token
575598 }
576599
577600 } else {
@@ -588,7 +611,7 @@ struct mtmd_tokenizer {
588611 }
589612
590613 if (!ctx->aud_beg .empty ()) {
591- add_text (ctx->aud_beg , false , true ); // add audio begin token
614+ add_text (ctx->aud_beg , true ); // add audio begin token
592615 }
593616
594617 // preprocess audio
@@ -632,7 +655,7 @@ struct mtmd_tokenizer {
632655 }
633656
634657 if (!ctx->aud_end .empty ()) {
635- add_text (ctx->aud_end , false , true ); // add audio end token
658+ add_text (ctx->aud_end , true ); // add audio end token
636659 }
637660 }
638661
0 commit comments