
Commit 7758c83

Merge branch 'remoteManagement' into remoteman_stripped
2 parents: ba9eb36 + fa1a3e4


48 files changed: +95933 −559 lines

.editorconfig

Lines changed: 4 additions & 0 deletions
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[tools/mtmd/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset

README.md

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@ It offers the following functionalities:
 ![418190669-f839ed18-0fb9-4319-82d8-03952c5d3f20](https://github.com/user-attachments/assets/2ee0c202-8f68-44d6-aa6f-18ae35f33857)
 
 - Agent thinking (based on prompts from this cool project [here](https://github.com/Wladastic/mini_autogpt))
-- Improvements to TextDB, such as VectorDB (embedding) support and document support (including upload of text documents, PDFs (SevenOf9 wrote the parser), OCR using the vision model loaded, and transcription from audio)
+- Improvements to TextDB, such as VectorDB (embedding) support and document support (including upload of text documents, PDFs (Vic49 / SevenOf9 wrote the parser), OCR using the vision model loaded, and transcription from audio)
 - Export / Import of WI groups from files
 
 ![8d3e3a9d-08e0-4caf-9ad8-c2d806fec7e0](https://github.com/user-attachments/assets/8d3e3a9d-08e0-4caf-9ad8-c2d806fec7e0)
@@ -87,7 +87,7 @@ Using this function requires the following conditions to be met:
 
 ![image](https://github.com/user-attachments/assets/41ec4f1c-5698-4ef3-ba7c-6998cbc1d8f3)
 
-- Upload document support (including upload of text documents, lorebooks, PDFs (SevenOf9 wrote the parser), OCR using the vision model loaded, and transcription from audio)
+- Upload document support (including upload of text documents, lorebooks, PDFs (Vic49 / SevenOf9 wrote the parser), OCR using the vision model loaded, and transcription from audio)
 - Export / Import of WI groups from files
 
 ## Running the fork

common/arg.cpp

Lines changed: 5 additions & 5 deletions
@@ -40,7 +40,7 @@
 using json = nlohmann::ordered_json;
 
 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
 
@@ -2234,12 +2234,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
@@ -2869,7 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
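Since the option above now accepts either flag and can be repeated, here is a minimal invocation sketch; the file names are hypothetical and the Gemma 3 preset is just one of the `-hf` models listed in `docs/multimodal.md` below:

```sh
# Sketch: pass several media files to one multimodal run (photo_1.jpg / photo_2.jpg are placeholders).
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF \
    --image photo_1.jpg --image photo_2.jpg \
    -p "Compare the two images."
```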

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,

convert_hf_to_gguf.py

Lines changed: 118 additions & 42 deletions
Large diffs are not rendered by default.

docs/multimodal.md

Lines changed: 13 additions & 1 deletion
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
 
-To enable it, can use use one of the 2 methods below:
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
+To enable it, you can use one of the 2 methods below:
 
 - Use `-hf` option with a supported model (see a list of pre-quantized model below)
 - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
 
 NOTE: some models may require large context window, for example: `-c 8192`
 
+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Llama 4 Scout
 (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+```
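Tying the new audio section to the `--audio` flag added in `common/arg.cpp`, a hedged usage sketch (the WAV path and prompt are hypothetical; `llama-server` with the OpenAI-compatible API is the other documented route):

```sh
# Sketch: run the Ultravox preset from the doc on a local file (speech.wav is a placeholder).
llama-mtmd-cli -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF \
    --audio speech.wav \
    -p "Transcribe the audio."
```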

ggml/include/ggml.h

Lines changed: 2 additions & 2 deletions
@@ -534,15 +534,15 @@ extern "C" {
         GGML_UNARY_OP_STEP,
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
         GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
-        GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
-        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU_ERF,
 
         GGML_UNARY_OP_COUNT,
     };

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 1 addition & 0 deletions
@@ -212,6 +212,7 @@ static __global__ void flash_attn_vec_ext_f16(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP

ggml/src/ggml-cuda/fattn-vec-f32.cuh

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ static __global__ void flash_attn_vec_ext_f32(
             }
         }
         if (__all_sync(0xFFFFFFFF, skip)) {
+            __syncthreads();
             continue;
         }
 #endif // GGML_USE_HIP

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 4 additions & 0 deletions
@@ -2182,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_SILU:
             ggml_cuda_op_silu(ctx, dst);
             break;
+        case GGML_UNARY_OP_GELU_ERF:
+            ggml_cuda_op_gelu_erf(ctx, dst);
+            break;
         case GGML_UNARY_OP_GELU_QUICK:
             ggml_cuda_op_gelu_quick(ctx, dst);
             break;
@@ -2967,6 +2970,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_SIGMOID:
         case GGML_UNARY_OP_HARDSIGMOID:
         case GGML_UNARY_OP_HARDSWISH:
+        case GGML_UNARY_OP_GELU_ERF:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_EXP:
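As a quick way to exercise the new CUDA path, llama.cpp's backend test harness can compare the GPU result against the CPU reference; the build path below and the assumption that the `-o` filter accepts the unary op name are mine, not part of this commit:

```sh
# Sketch: run only the GELU_ERF cases of the backend-ops test suite (binary location depends on your build).
./build/bin/test-backend-ops test -o GELU_ERF
```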
