@@ -157,18 +157,26 @@ struct mtmd_context {
157157 throw std::runtime_error (string_format (" Failed to load CLIP model from %s\n " , mmproj_fname));
158158 }
159159
160- clip_ctx * ctx_clip = get_clip_ctx ();
161- if (llama_model_n_embd (text_model) != clip_n_mmproj_embd (ctx_clip)) {
160+ if (llama_model_n_embd (text_model) != n_embd_projected ()) {
162161 throw std::runtime_error (string_format (
163162 " mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n "
164163 " hint: you may be using wrong mmproj\n " ,
165- llama_model_n_embd (text_model), clip_n_mmproj_embd (ctx_clip )));
164+ llama_model_n_embd (text_model), n_embd_projected ( )));
166165 }
166+ if (ctx_v) {
167+ init_vision ();
168+ }
169+ if (ctx_a) {
170+ init_audio ();
171+ }
172+ }
167173
168- use_mrope = clip_is_qwen2vl (ctx_clip);
174+ void init_vision () {
175+ GGML_ASSERT (ctx_v != nullptr );
176+ use_mrope = clip_is_qwen2vl (ctx_v);
169177
170- projector_type proj = clip_get_projector_type (ctx_clip );
171- int minicpmv_version = clip_is_minicpmv (ctx_clip );
178+ projector_type proj = clip_get_projector_type (ctx_v );
179+ int minicpmv_version = clip_is_minicpmv (ctx_v );
172180 if (minicpmv_version == 2 ) {
173181 // minicpmv 2.5 format:
174182 // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -219,57 +227,53 @@ struct mtmd_context {
219227 }
220228
221229 // set boi/eoi
222- projector_type pt = proj_type ();
223- if (pt == PROJECTOR_TYPE_GEMMA3) {
230+ if (proj == PROJECTOR_TYPE_GEMMA3) {
224231 // <start_of_image> ... (image embeddings) ... <end_of_image>
225232 img_beg = " <start_of_image>" ;
226233 img_end = " <end_of_image>" ;
227234
228- } else if (pt == PROJECTOR_TYPE_IDEFICS3) {
235+ } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
229236 // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
230237 img_beg = " <fake_token_around_image><global-img>" ;
231238 img_end = " <fake_token_around_image>" ;
232239
233- } else if (pt == PROJECTOR_TYPE_PIXTRAL) {
240+ } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
234241 // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
235242 img_end = " [IMG_END]" ;
236243
237- } else if (pt == PROJECTOR_TYPE_QWEN2VL || pt == PROJECTOR_TYPE_QWEN25VL) {
244+ } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
238245 // <|vision_start|> ... (image embeddings) ... <|vision_end|>
239246 img_beg = " <|vision_start|>" ;
240247 img_end = " <|vision_end|>" ;
241248
242- } else if (pt == PROJECTOR_TYPE_LLAMA4) {
249+ } else if (proj == PROJECTOR_TYPE_LLAMA4) {
243250 // (more details in mtmd_context constructor)
244251 img_beg = " <|image_start|>" ;
245252 img_end = " <|image_end|>" ;
253+ LOG_WRN (" %s: llama 4 vision is known to have degraded quality:\n "
254+ " https://github.com/ggml-org/llama.cpp/pull/13282\n " , __func__);
246255
247- } else if (pt == PROJECTOR_TYPE_INTERNVL) {
256+ } else if (proj == PROJECTOR_TYPE_INTERNVL) {
248257 // <img> ... (image embeddings) ... </img>
249258 img_beg = " <img>" ;
250259 img_end = " </img>" ;
251260
252- } else if (pt == PROJECTOR_TYPE_QWEN2A) {
261+ }
262+ }
263+
// init_audio: audio-specific setup for this context.
// Requires ctx_a to be non-null (asserted below). Reads the projector type from
// ctx_a, logs a warning that audio input is experimental, and for the QWEN2A
// projector assigns the aud_beg / aud_end marker strings. For any other
// projector type the markers are not assigned by this function.
264+ void init_audio () {
265+ GGML_ASSERT (ctx_a != nullptr );
266+ projector_type proj = clip_get_projector_type (ctx_a);
267+
// the warning is unconditional here: ctx_a is asserted non-null above, so an
// audio context is always present when this function runs
268+ LOG_WRN (" %s: audio input is in experimental stage and may have reduced quality:\n "
269+ " https://github.com/ggml-org/llama.cpp/discussions/13759\n " , __func__);
270+
271+ if (proj == PROJECTOR_TYPE_QWEN2A) {
253272 // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
254273 aud_beg = " <|audio_bos|>" ;
255274 aud_end = " <|audio_eos|>" ;
256275
257276 }
258-
259- // warning messages
260- if (proj == PROJECTOR_TYPE_LLAMA4) {
261- LOG_WRN (" %s: llama 4 vision is known to have degraded quality:\n "
262- " https://github.com/ggml-org/llama.cpp/pull/13282\n " , __func__);
263- }
264- if (ctx_a) {
265- LOG_WRN (" %s: audio input is in experimental stage and may have reduced quality:\n "
266- " https://github.com/ggml-org/llama.cpp/discussions/13759\n " , __func__);
267- }
268- }
269-
270- // get the main clip ctx
271- clip_ctx * get_clip_ctx () const {
272- return ctx_v ? ctx_v : ctx_a;
273277 }
274278
275279 // get clip ctx based on chunk type
@@ -282,14 +286,17 @@ struct mtmd_context {
282286 GGML_ABORT (" unknown chunk type" );
283287 }
284288
285- // both audio and vision contexts have the same projector type
286- projector_type proj_type () const {
287- return clip_get_projector_type (get_clip_ctx ());
// projector type of the vision context (ctx_v);
// PROJECTOR_TYPE_UNKNOWN when ctx_v is null
289+ projector_type proj_type_v () const {
290+ return ctx_v ? clip_get_projector_type (ctx_v) : PROJECTOR_TYPE_UNKNOWN;
291+ }
292+
// projector type of the audio context (ctx_a);
// PROJECTOR_TYPE_UNKNOWN when ctx_a is null
293+ projector_type proj_type_a () const {
294+ return ctx_a ? clip_get_projector_type (ctx_a) : PROJECTOR_TYPE_UNKNOWN;
288295 }
289296
290297 // audio and vision contexts are expected to share the same n_embd output dimension
// returns clip_n_mmproj_embd() of ctx_v when it is set, otherwise of ctx_a
// NOTE(review): only one context is queried, so a caller comparing this value
// against the text model does not validate ctx_a when ctx_v is also present — confirm
// NOTE(review): if both ctx_v and ctx_a are null, a null pointer is passed to
// clip_n_mmproj_embd(); presumably ruled out before this is called — verify
291298 int n_embd_projected () const {
292- return clip_n_mmproj_embd (get_clip_ctx () );
299+ return clip_n_mmproj_embd (ctx_v ? ctx_v : ctx_a );
293300 }
294301
295302 ~mtmd_context () {
@@ -400,6 +407,7 @@ struct mtmd_tokenizer {
400407 }
401408
// Text overload of add_text: logs txt at debug level, tokenizes it with
// mtmd_tokenize_text_internal() using this tokenizer's vocab, and forwards the
// resulting tokens to the add_text(tokens) overload.
//   txt           - text to tokenize
//   add_special   - passed through unchanged to mtmd_tokenize_text_internal()
//   parse_special - passed through unchanged to mtmd_tokenize_text_internal()
402409 void add_text (const std::string & txt, bool add_special, bool parse_special) {
410+ LOG_DBG (" %s: %s\n " , __func__, txt.c_str ());
403411 auto tokens = mtmd_tokenize_text_internal (vocab, txt, add_special, parse_special);
404412 add_text (tokens);
405413 }
@@ -434,7 +442,9 @@ struct mtmd_tokenizer {
434442 return 2 ;
435443 }
436444
437- add_text (ctx->img_beg , false , true ); // add image begin token
445+ if (!ctx->img_beg .empty ()) {
446+ add_text (ctx->img_beg , false , true ); // add image begin token
447+ }
438448
439449 // convert mtmd_bitmap to clip_image_u8
440450 clip_image_u8_ptr img_u8 (clip_image_u8_init ());
@@ -549,7 +559,9 @@ struct mtmd_tokenizer {
549559 cur.entries .emplace_back (std::move (chunk));
550560 }
551561
552- add_text (ctx->img_end , false , true ); // add image end token
562+ if (!ctx->img_end .empty ()) {
563+ add_text (ctx->img_end , false , true ); // add image end token
564+ }
553565
554566 } else {
555567 // handle audio
@@ -564,7 +576,9 @@ struct mtmd_tokenizer {
564576 return 2 ;
565577 }
566578
567- add_text (ctx->aud_beg , false , true ); // add audio begin token
579+ if (!ctx->aud_beg .empty ()) {
580+ add_text (ctx->aud_beg , false , true ); // add audio begin token
581+ }
568582
569583 // preprocess audio
570584 GGML_ASSERT (ctx->w_filters .n_mel ); // make sure we have filter preloaded
@@ -606,7 +620,9 @@ struct mtmd_tokenizer {
606620 cur.entries .emplace_back (std::move (chunk));
607621 }
608622
609- add_text (ctx->aud_end , false , true ); // add audio end token
623+ if (!ctx->aud_end .empty ()) {
624+ add_text (ctx->aud_end , false , true ); // add audio end token
625+ }
610626 }
611627
612628 return 0 ;
@@ -751,7 +767,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
751767}
752768
// Returns true only when ctx has a vision context (ctx_v) whose projector type
// is PROJECTOR_TYPE_GEMMA3; returns false otherwise, including when ctx_v is
// null (e.g. an audio-only context).
// NOTE(review): presumably signals that decoding should use a non-causal mask —
// inferred from the function name only; confirm against the caller.
753769bool mtmd_decode_use_non_causal (mtmd_context * ctx) {
754- if (ctx->proj_type ( ) == PROJECTOR_TYPE_GEMMA3) {
770+ if (ctx->ctx_v && clip_get_projector_type (ctx-> ctx_v ) == PROJECTOR_TYPE_GEMMA3) {
755771 return true ;
756772 }
757773 return false ;
0 commit comments