final touch UX

ngxson · ngxson · commit e6416b0d3cf2 · 2025-05-21T18:30:08.000+02:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -309,6 +309,7 @@ def prepare_tensors(self):
                             gguf.MODEL_TENSOR.POSNET_NORM1,
                             gguf.MODEL_TENSOR.POSNET_NORM2,
                             gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                            gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                         )
                     )
                     or not new_name.endswith(".weight")
diff --git a/docs/multimodal.md b/docs/multimodal.md
@@ -4,6 +4,8 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
 
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
 To enable it, you can use one of the 2 methods below:
 
 - Use the `-hf` option with a supported model (see the list of pre-quantized models below)
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
 
 NOTE: some models may require a large context window, for example: `-c 8192`
 
+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Llama 4 Scout
 (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+```
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -2199,10 +2199,6 @@ struct clip_model_loader {
             LOG_INF("%s: ffn_op:             %s\n", __func__, log_ffn_op.c_str());
             LOG_INF("%s: model size:         %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_INF("%s: metadata size:      %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
-
-            if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
-                LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
-            }
         }
     }
 
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
@@ -37,10 +37,10 @@ static volatile bool g_is_interrupted = false;
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG(
         "Experimental CLI for multimodal\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
+        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
         "  -m and --mmproj are required\n"
         "  -hf user/repo can replace both -m and --mmproj in most cases\n"
-        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+        "  --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
         "  to disable using GPU for mmproj model, add --no-mmproj-offload\n",
         argv[0]
     );
@@ -142,7 +142,7 @@ struct mtmd_cli_context {
         );
     }
 
-    bool load_image(const std::string & fname) {
+    bool load_media(const std::string & fname) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
         if (!bmp.ptr) {
             return false;
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
         msg.role = "user";
         msg.content = params.prompt;
         for (const auto & image : params.image) {
-            if (!ctx.load_image(image)) {
+            if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
         }
@@ -303,7 +303,12 @@ int main(int argc, char ** argv) {
 
     } else {
         LOG("\n Running in chat mode, available commands:");
-        LOG("\n   /image <path>    load an image");
+        if (mtmd_support_vision(ctx.ctx_vision.get())) {
+            LOG("\n   /image <path>    load an image");
+        }
+        if (mtmd_support_audio(ctx.ctx_vision.get())) {
+            LOG("\n   /audio <path>    load an audio");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -333,14 +338,16 @@ int main(int argc, char ** argv) {
                 continue;
             }
             g_is_generating = true;
-            if (line == "/image" || line.find("/image ") == 0) {
+            bool is_image = line == "/image" || line.find("/image ") == 0;
+            bool is_audio = line == "/audio" || line.find("/audio ") == 0;
+            if (is_image || is_audio) {
                 if (line.size() < 8) {
-                    LOG_ERR("ERR: Missing image filename\n");
+                    LOG_ERR("ERR: Missing media filename\n");
                     continue;
                 }
-                std::string image = line.substr(7);
-                if (ctx.load_image(image)) {
-                    LOG("Image %s loaded\n", image.c_str());
+                std::string media_path = line.substr(7);
+                if (ctx.load_media(media_path)) {
+                    LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
                     content += MTMD_DEFAULT_MEDIA_MARKER;
                 }
                 // else, error is already printed by libmtmd
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
@@ -108,9 +108,9 @@ struct mtmd_context {
     llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_bm_start = LLAMA_TOKEN_NULL; // single slice start
-    llama_token tok_sli_bm_end   = LLAMA_TOKEN_NULL; // single slice end
-    llama_token tok_sli_bm_mid   = LLAMA_TOKEN_NULL; // between 2 slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
     llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
     bool        tok_row_end_trail = false;
     bool        ov_img_first      = false;
@@ -156,8 +156,8 @@ struct mtmd_context {
             tok_ov_img_end    = lookup_token("</image>");
             tok_slices_start  = lookup_token("<slice>");
             tok_slices_end    = lookup_token("</slice>");
-            tok_sli_bm_start = tok_ov_img_start;
-            tok_sli_bm_end   = tok_ov_img_end;
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
             tok_row_end       = lookup_token("\n");
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
@@ -168,8 +168,8 @@ struct mtmd_context {
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
             tok_ov_img_start  = lookup_token("<image>");
             tok_ov_img_end    = lookup_token("</image>");
-            tok_sli_bm_start = lookup_token("<slice>");
-            tok_sli_bm_end   = lookup_token("</slice>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
             tok_row_end       = lookup_token("\n");
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
@@ -186,7 +186,7 @@ struct mtmd_context {
             // <|image_end|>
             slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
             tok_ov_img_start  = lookup_token("<|image|>");
-            tok_sli_bm_mid   = lookup_token("<|tile_x_separator|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
             tok_row_end       = lookup_token("<|tile_y_separator|>");
             tok_row_end_trail = true; // add trailing end-of-row token
             ov_img_first      = false; // overview image is last
@@ -196,6 +196,16 @@ struct mtmd_context {
             // TODO @ngxson : check if model n_mel is 128 or 80
             w_filters = whisper_precalc_filters::get_128_bins();
         }
+
+        // warning messages
+        if (proj == PROJECTOR_TYPE_LLAMA4) {
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+        }
+        if (has_audio) {
+            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+        }
     }
 
     ~mtmd_context() {
@@ -441,15 +451,15 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                     for (int y = 0; y < n_row; y++) {
                         for (int x = 0; x < n_col; x++) {
                             const bool is_last_in_row = (x == n_col - 1);
-                            if (ctx->tok_sli_bm_start != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_bm_start});
+                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_start});
                             }
                             output->entries.emplace_back(std::move(chunks[y * n_col + x]));
-                            if (ctx->tok_sli_bm_end != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_bm_end});
+                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_end});
                             }
-                            if (!is_last_in_row && ctx->tok_sli_bm_mid != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_bm_mid});
+                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                                add_text_chunk({ctx->tok_sli_img_mid});
                             }
                         }
                         if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {

Original file line number	Diff line number	Diff line change
`@@ -309,6 +309,7 @@ def prepare_tensors(self):`
`309`	`309`	`gguf.MODEL_TENSOR.POSNET_NORM1,`
`310`	`310`	`gguf.MODEL_TENSOR.POSNET_NORM2,`
`311`	`311`	`gguf.MODEL_TENSOR.V_ENC_EMBD_POS,`
	`312`	`+ gguf.MODEL_TENSOR.A_ENC_EMBD_POS,`
`312`	`313`	`)`
`313`	`314`	`)`
`314`	`315`	`or not new_name.endswith(".weight")`
Original file line number	Diff line number	Diff line change
`@@ -2199,10 +2199,6 @@ struct clip_model_loader {`
`2199`	`2199`	`LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());`
`2200`	`2200`	`LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);`
`2201`	`2201`	`LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);`
`2202`		`-`
`2203`		`- if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {`
`2204`		`- LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);`
`2205`		`- }`
`2206`	`2202`	`}`
`2207`	`2203`	`}`
`2208`	`2204`