@@ -76,7 +76,7 @@ enum mtmd_slice_tmpl {
76
76
MTMD_SLICE_TMPL_MINICPMV_2_5,
77
77
MTMD_SLICE_TMPL_MINICPMV_2_6,
78
78
MTMD_SLICE_TMPL_LLAMA4,
79
- // TODO @ngxson : add support for idefics (SmolVLM)
79
+ MTMD_SLICE_TMPL_IDEFICS3,
80
80
};
81
81
82
82
const char * mtmd_default_marker () {
@@ -127,6 +127,9 @@ struct mtmd_context {
127
127
128
128
bool use_mrope = false ; // for Qwen2VL, we need to use M-RoPE
129
129
130
+ // string template for slice image delimiters with row/col (idefics3)
131
+ std::string sli_img_start_tmpl;
132
+
130
133
// for whisper, we pre-calculate the mel filter bank
131
134
whisper_preprocessor::whisper_filters w_filters;
132
135
@@ -245,8 +248,12 @@ struct mtmd_context {
245
248
246
249
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
247
250
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
248
- img_beg = " <fake_token_around_image><global-img>" ;
249
- img_end = " <fake_token_around_image>" ;
251
+ slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
252
+ tok_ov_img_start = {lookup_token (" \n " ), lookup_token (" <fake_token_around_image>" ), lookup_token (" <global-img>" )};
253
+ tok_ov_img_end = {lookup_token (" <fake_token_around_image>" )};
254
+ tok_row_end = {lookup_token (" \n " )};
255
+ img_beg = " <fake_token_around_image>" ;
256
+ sli_img_start_tmpl = " <fake_token_around_image><row_%d_col_%d>" ;
250
257
251
258
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
252
259
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
@@ -504,6 +511,7 @@ struct mtmd_tokenizer {
504
511
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
505
512
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
506
513
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
514
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
507
515
) {
508
516
const int n_col = batch_f32.grid_x ;
509
517
const int n_row = batch_f32.grid_y ;
@@ -537,6 +545,12 @@ struct mtmd_tokenizer {
537
545
const bool is_last_in_row = (x == n_col - 1 );
538
546
if (!ctx->tok_sli_img_start .empty ()) {
539
547
add_text (ctx->tok_sli_img_start );
548
+ } else if (!ctx->sli_img_start_tmpl .empty ()) {
549
+ // If using a template to precede a slice image
550
+ const size_t sz = std::snprintf (nullptr , 0 , ctx->sli_img_start_tmpl .c_str (), y+1 , x+1 ) + 1 ;
551
+ std::unique_ptr<char []> buf (new char [sz]);
552
+ std::snprintf (buf.get (), sz, ctx->sli_img_start_tmpl .c_str (), y+1 , x+1 );
553
+ add_text (std::string (buf.get (), buf.get () + sz - 1 ), true );
540
554
}
541
555
cur.entries .emplace_back (std::move (chunks[y * n_col + x]));
542
556
if (!ctx->tok_sli_img_end .empty ()) {
@@ -780,7 +794,10 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
780
794
ctx->image_embd_v .resize (image_tokens->n_tokens () * n_mmproj_embd);
781
795
bool ok = false ;
782
796
783
- if (clip_is_llava (ctx_clip) || clip_is_minicpmv (ctx_clip) || clip_is_glm (ctx_clip)) {
797
+ if (clip_is_llava (ctx_clip)
798
+ || clip_is_minicpmv (ctx_clip)
799
+ || clip_is_glm (ctx_clip)
800
+ || clip_is_idefics3 (ctx_clip)) {
784
801
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
785
802
const auto & entries = image_tokens->batch_f32 .entries ;
786
803
for (size_t i = 0 ; i < entries.size (); i++) {
0 commit comments