# Multimodal Support in llama.cpp

This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.

> [!IMPORTANT]
>
> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.

The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:

- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, making it the second supported vision model. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this growing collection of command-line tools became confusing for users.
- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.

## Pre-quantized models

See the list of pre-quantized models [here](../../docs/multimodal.md).

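As an illustration, a pre-quantized model can usually be run directly from Hugging Face with the `-hf` flag; recent builds will also fetch the matching `mmproj` file automatically. The repository name below is only an example, and the exact flag behavior may differ between versions:

```sh
# Fetch a pre-quantized model (and its mmproj) from Hugging Face, then describe an image.
# The repository name is illustrative; pick one from the pre-quantized model list.
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF \
    --image path/to/image.jpg \
    -p "Describe this image in detail."
```
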
## How it works and what is `mmproj`?

Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.

This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.

Consequently, running a multimodal model typically requires two GGUF files:
1. The standard language model file.
2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.

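As a sketch of what this looks like in practice (the file names below are placeholders), the language model is passed with `-m` and the projector with `--mmproj`:

```sh
# Run a multimodal model from two local GGUF files:
#   model-f16.gguf        - the language model
#   mmproj-model-f16.gguf - the matching multimodal projector
llama-mtmd-cli -m model-f16.gguf --mmproj mmproj-model-f16.gguf \
    --image path/to/image.jpg \
    -p "What is in this picture?"
```

The same pair of files can also be passed to `llama-server` via the same `-m`/`--mmproj` flags.
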
## What is `libmtmd`?

As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.

Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.

## How to obtain `mmproj`

Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file (see the conversion sketch after this list):
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d); see the guide [here](../../docs/multimodal/gemma3.md). Note: the 1B variant does not have vision support.
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with a `transformers`-compatible checkpoint
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: conversion of `InternVL3-*-hf` models is not supported, only the non-HF version; the `InternLM2Model` **text** model is not supported)

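A minimal conversion sketch (paths and output file names are placeholders) for one of the models listed above; the script is run once for the language model and once more with `--mmproj` for the projector:

```sh
# 1) Convert the language model to GGUF as usual
python convert_hf_to_gguf.py /path/to/hf-model --outfile model-f16.gguf --outtype f16

# 2) Run the script again with --mmproj to produce the projector file
python convert_hf_to_gguf.py /path/to/hf-model --mmproj --outfile mmproj-model-f16.gguf
```
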
For older models, please refer to the relevant guide for instructions on how to obtain or create them:

NOTE: conversion scripts are located under `tools/mtmd/legacy-models`

- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)