@@ -97,6 +97,8 @@ struct mtmd_context {
9797 bool print_timings;
9898 int n_threads;
9999 std::string image_marker;
100+ bool has_vision;
101+ bool has_audio;
100102
101103 // for llava-uhd style models, we need special tokens in-between slices
102104 // minicpmv calls them "slices", llama 4 calls them "tiles"
@@ -135,7 +137,9 @@ struct mtmd_context {
135137 throw std::runtime_error (string_format (" Failed to load CLIP model from %s\n " , mmproj_fname));
136138 }
137139
138- use_mrope = clip_is_qwen2vl (ctx_clip);
140+ has_vision = clip_has_vision_encoder (ctx_clip);
141+ has_audio = clip_has_audio_encoder (ctx_clip);
142+ use_mrope = clip_is_qwen2vl (ctx_clip);
139143
140144 projector_type proj = clip_get_projector_type (ctx_clip);
141145 int minicpmv_version = clip_is_minicpmv (ctx_clip);
@@ -362,15 +366,24 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
362366 output->entries .emplace_back (std::move (chunk));
363367
364368 // only add image/audio tokens to middle of 2 parts
365- bool is_not_last = &parts.back () != &part;
369+ // therefore, we skip handling image/audio if this is the last part
370+ if (&parts.back () == &part) {
371+ continue ;
372+ }
373+
374+ if (!bitmaps[i_bm]->is_audio ) {
375+ // handle image
366376
367- // handle image
368- if (is_not_last && !bitmaps[i_bm]->is_audio ) {
369377 if (i_bm >= n_bitmaps) {
370378 LOG_ERR (" %s: error: not enough images for %d parts\n " , __func__, (int )parts.size ());
371379 return 1 ;
372380 }
373381
382+ if (!ctx->has_vision ) {
383+ LOG_ERR (" %s: error: model does not support vision input\n " , __func__);
384+ return 2 ;
385+ }
386+
374387 // convert mtmd_bitmap to clip_image_u8
375388 clip_image_u8_ptr img_u8 (clip_image_u8_init ());
376389 img_u8->nx = bitmaps[i_bm]->nx ;
@@ -486,15 +499,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
486499
487500 i_bm++; // move to next image
488501 continue ;
489- }
490-
491- // handle audio
492- if (is_not_last && bitmaps[i_bm]-> is_audio ) {
502+
503+ } else {
504+ // handle audio
505+
493506 if (i_bm >= n_bitmaps) {
494507 LOG_ERR (" %s: error: not enough images for %d parts\n " , __func__, (int )parts.size ());
495508 return 1 ;
496509 }
497510
511+ if (!ctx->has_audio ) {
512+ LOG_ERR (" %s: error: model does not support audio input\n " , __func__);
513+ return 2 ;
514+ }
515+
498516 // preprocess audio
499517 whisper_preprocessor::whisper_mel mel_spec;
500518 GGML_ASSERT (ctx->w_filters .n_mel );
@@ -506,9 +524,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
506524 return 2 ;
507525 }
508526
509- // DEBUG!!!!!!!!!!
510- printf (" mel_spec.n_len = %d\n " , mel_spec.n_len );
511- printf (" mel_spec.n_mel = %d\n " , mel_spec.n_mel );
527+ // DEBUG!!!
528+ // mel_spec.data.resize(220*8*2 * mel_spec.n_mel);
529+ // mel_spec.n_len = 220*8*2;
530+ LOG_DBG (" mel_spec.n_len = %d\n " , mel_spec.n_len );
531+ LOG_DBG (" mel_spec.n_mel = %d\n " , mel_spec.n_mel );
512532
513533 // convert mel spectrogram to clip_image_f32_batch
514534 clip_image_f32_ptr mel_f32 (clip_image_f32_init ());
@@ -526,6 +546,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
526546 audio_tokens->batch_f32 = std::move (batch_f32);
527547 audio_tokens->id = bitmaps[i_bm]->id ; // optional
528548
549+ LOG_DBG (" audio_tokens->n_tokens = %d\n " , audio_tokens->n_tokens );
550+
529551 mtmd_input_chunk chunk{
530552 MTMD_INPUT_CHUNK_TYPE_AUDIO,
531553 {}, // text tokens
@@ -606,6 +628,14 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
606628 return ctx->use_mrope ;
607629}
608630
// Returns the context's has_vision flag.
// The flag is assigned from clip_has_vision_encoder(ctx_clip) once the CLIP
// model has been loaded; mtmd_tokenize() rejects image input with return
// code 2 when it is false.
// ctx is dereferenced without a null check, so callers must pass a valid context.
631+ bool mtmd_support_vision (mtmd_context * ctx) {
632+ return ctx->has_vision ;
633+ }
634+
// Returns the context's has_audio flag.
// The flag is assigned from clip_has_audio_encoder(ctx_clip) once the CLIP
// model has been loaded; mtmd_tokenize() rejects audio input with return
// code 2 when it is false.
// ctx is dereferenced without a null check, so callers must pass a valid context.
635+ bool mtmd_support_audio (mtmd_context * ctx) {
636+ return ctx->has_audio ;
637+ }
638+
609639// these 2 helpers below use internal clip_image_u8_ptr,
611641// so unfortunately they cannot be moved to mtmd-helper.h
611641// however, in theory, user can decode image file to bitmap using
0 commit comments