@@ -108,9 +108,9 @@ struct mtmd_context {
108108 llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
109109 llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
110110 llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
111- llama_token tok_sli_bm_start = LLAMA_TOKEN_NULL; // single slice start
112- llama_token tok_sli_bm_end = LLAMA_TOKEN_NULL; // single slice end
113- llama_token tok_sli_bm_mid = LLAMA_TOKEN_NULL; // between 2 slices
111+ llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
112+ llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
113+ llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
114114 llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
115115 bool tok_row_end_trail = false ;
116116 bool ov_img_first = false ;
@@ -156,8 +156,8 @@ struct mtmd_context {
156156 tok_ov_img_end = lookup_token (" </image>" );
157157 tok_slices_start = lookup_token (" <slice>" );
158158 tok_slices_end = lookup_token (" </slice>" );
159- tok_sli_bm_start = tok_ov_img_start;
160- tok_sli_bm_end = tok_ov_img_end;
159+ tok_sli_img_start = tok_ov_img_start;
160+ tok_sli_img_end = tok_ov_img_end;
161161 tok_row_end = lookup_token (" \n " );
162162 tok_row_end_trail = false ; // no trailing end-of-row token
163163 ov_img_first = true ;
@@ -168,8 +168,8 @@ struct mtmd_context {
168168 slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
169169 tok_ov_img_start = lookup_token (" <image>" );
170170 tok_ov_img_end = lookup_token (" </image>" );
171- tok_sli_bm_start = lookup_token (" <slice>" );
172- tok_sli_bm_end = lookup_token (" </slice>" );
171+ tok_sli_img_start = lookup_token (" <slice>" );
172+ tok_sli_img_end = lookup_token (" </slice>" );
173173 tok_row_end = lookup_token (" \n " );
174174 tok_row_end_trail = false ; // no trailing end-of-row token
175175 ov_img_first = true ;
@@ -186,7 +186,7 @@ struct mtmd_context {
186186 // <|image_end|>
187187 slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
188188 tok_ov_img_start = lookup_token (" <|image|>" );
189- tok_sli_bm_mid = lookup_token (" <|tile_x_separator|>" );
189+ tok_sli_img_mid = lookup_token (" <|tile_x_separator|>" );
190190 tok_row_end = lookup_token (" <|tile_y_separator|>" );
191191 tok_row_end_trail = true ; // add trailing end-of-row token
192192 ov_img_first = false ; // overview image is last
@@ -196,6 +196,16 @@ struct mtmd_context {
196196 // TODO @ngxson : check if model n_mel is 128 or 80
197197 w_filters = whisper_precalc_filters::get_128_bins ();
198198 }
199+
200+ // warning messages
201+ if (proj == PROJECTOR_TYPE_LLAMA4) {
202+ LOG_WRN (" %s: llama 4 vision is known to have degraded quality:\n "
203+ " https://github.com/ggml-org/llama.cpp/pull/13282\n " , __func__);
204+ }
205+ if (has_audio) {
206+ LOG_WRN (" %s: audio input is in experimental stage and may have reduced quality:\n "
207+ " https://github.com/ggml-org/llama.cpp/pull/13623\n " , __func__);
208+ }
199209 }
200210
201211 ~mtmd_context () {
@@ -441,15 +451,15 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
441451 for (int y = 0 ; y < n_row; y++) {
442452 for (int x = 0 ; x < n_col; x++) {
443453 const bool is_last_in_row = (x == n_col - 1 );
444- if (ctx->tok_sli_bm_start != LLAMA_TOKEN_NULL) {
445- add_text_chunk ({ctx->tok_sli_bm_start });
454+ if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
455+ add_text_chunk ({ctx->tok_sli_img_start });
446456 }
447457 output->entries .emplace_back (std::move (chunks[y * n_col + x]));
448- if (ctx->tok_sli_bm_end != LLAMA_TOKEN_NULL) {
449- add_text_chunk ({ctx->tok_sli_bm_end });
458+ if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
459+ add_text_chunk ({ctx->tok_sli_img_end });
450460 }
451- if (!is_last_in_row && ctx->tok_sli_bm_mid != LLAMA_TOKEN_NULL) {
452- add_text_chunk ({ctx->tok_sli_bm_mid });
461+ if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
462+ add_text_chunk ({ctx->tok_sli_img_mid });
453463 }
454464 }
455465 if ((y != n_row - 1 || ctx->tok_row_end_trail ) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
0 commit comments