@@ -86,6 +86,7 @@ mtmd_context_params mtmd_context_params_default() {
     params.n_threads = 4;
     params.verbosity = GGML_LOG_LEVEL_INFO;
     params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
+    params.media_marker = MTMD_DEFAULT_MEDIA_MARKER;
     return params;
 }
 
@@ -96,7 +97,7 @@ struct mtmd_context {
 
     bool print_timings;
     int n_threads;
-    std::string image_marker;
+    std::string media_marker;
     bool has_vision;
     bool has_audio;
 
@@ -127,8 +128,12 @@ struct mtmd_context {
         text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
-        image_marker (ctx_params.image_marker)
+        media_marker (ctx_params.media_marker)
     {
+        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
+            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
+        }
+
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -269,48 +274,51 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text->text);
-    std::string marker_modified(ctx->image_marker);
+    std::string marker_modified(ctx->media_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
 
+    // for compatibility, we convert image marker to media marker
+    string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+
     // a bit hacky here, but works for now
     // for some models, we need to add prefix and suffix to the image embeddings
     if (clip_is_gemma3(ctx->ctx_clip)) {
         // gemma 3
         // <start_of_image> ... (image embeddings) ... <end_of_image>
-        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-        marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
-        marker_modified = ctx->image_marker + "[IMG_END]";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = ctx->media_marker + "[IMG_END]";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-        marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
         // (more details in mtmd_context constructor)
-        marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
         // <img> ... (image embeddings) ... </img>
-        marker_modified = "<img>" + ctx->image_marker + "</img>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+        marker_modified = "<img>" + ctx->media_marker + "</img>";
+        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
 
     }
 
     // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
     // for glm-edge, BOI and EOI token's embeddings are not present in the text model
 
-    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
     output->entries.clear();
     output->entries.reserve(parts.size());
 
@@ -820,6 +828,15 @@ llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
     }
 }
 
+const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
+    if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        return chunk->tokens_image->id.c_str();
+    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+        return chunk->tokens_audio->id.c_str();
+    }
+    return nullptr;
+}
+
 mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
     mtmd_input_chunk * copy = new mtmd_input_chunk{
         chunk->type,
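
For reviewers, a minimal usage sketch for the new `mtmd_input_chunk_get_id` accessor. The chunk-iteration helpers `mtmd_input_chunks_size` and `mtmd_input_chunks_get` are assumed to be the existing public accessors in `mtmd.h`; only `mtmd_input_chunk_get_id` itself comes from this diff:

```cpp
// Hedged sketch: print the id of every chunk produced by mtmd_tokenize.
// mtmd_input_chunks_size / mtmd_input_chunks_get are assumed accessor names.
#include <cstdio>
#include "mtmd.h"

static void print_chunk_ids(const mtmd_input_chunks * chunks) {
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        const char * id = mtmd_input_chunk_get_id(chunk);
        // text chunks carry no id, so the accessor returns nullptr for them
        printf("chunk %zu: id = %s\n", i, id ? id : "(text chunk)");
    }
}
```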
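And a self-contained illustration of the compatibility rewrite added at the top of `mtmd_tokenize`: prompts that still contain the old image marker are converted to the media marker before the prompt is split into chunks, so existing callers keep working. The `replace_all` helper below mirrors `string_replace_all`, and the marker strings are placeholders, not the real `MTMD_DEFAULT_IMAGE_MARKER` / `MTMD_DEFAULT_MEDIA_MARKER` values:

```cpp
// Standalone sketch of the back-compat marker rewrite (placeholder markers).
#include <iostream>
#include <string>

// mirrors the behaviour of llama.cpp's string_replace_all helper
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    if (search.empty()) return;
    size_t pos = 0;
    while ((pos = s.find(search, pos)) != std::string::npos) {
        s.replace(pos, search.size(), replace);
        pos += replace.size();
    }
}

int main() {
    const std::string image_marker = "<old_image_marker>"; // placeholder
    const std::string media_marker = "<media_marker>";     // placeholder

    std::string prompt = "describe this: <old_image_marker>";
    // mtmd_tokenize now performs this rewrite first, so a legacy prompt that
    // still uses the image marker yields the same chunk layout as before
    replace_all(prompt, image_marker, media_marker);
    std::cout << prompt << "\n"; // prints: describe this: <media_marker>
}
```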