
Commit 87e4762

ikawrakow and Iwan Kawrakow authored
Port mtmd from mainline + Qwen2/2.5-VL support (ikawrakow#798)
* Add mtmd: the beginning
* Add mtmd: mtmd.cpp compiles
* Add mtmd: clip initialization compiles
* Add mtmd: clip.cpp compiles
* Add mtmd: builds successfully
* Add CPU implementation for GGML_OP_GLU
* Add CUDA implementation for GGML_OP_GLU
* Add CPU implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW
* Add CUDA implementation for GGML_OP_CONV_2D and GGML_OP_CONV_2D_DW
* Add mtmd: refresh CPU rope
* Add mtmd: refresh CUDA rope
* Add mtmd: add Qwen2-VL
* Add mtmd: Qwen2.5-VL text seems to work with this change
* Add mtmd: fix swiglu
* Add mtmd: use LOG_TEE so generated tokens show up in terminal
* Add mtmd: do not attempt to load a GPU backend if none are available
* GLU, not GPU
* Fix typo
* Fix new/free mismatch
* LOG stuff
* Add mtmd: this fixes gibberish on second image

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 367654f · commit 87e4762


51 files changed: +115,261 −552 lines

common/chat.cpp

Lines changed: 2 additions & 1 deletion
@@ -489,11 +489,12 @@ std::string common_chat_format_single(
     return ss.str();
 }
 
-std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
     inputs.add_bos = tmpls->add_bos;
     inputs.add_eos = tmpls->add_eos;
+    inputs.chat_template_kwargs = chat_template_kwargs;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;

common/chat.h

Lines changed: 2 additions & 1 deletion
@@ -188,7 +188,8 @@ std::string common_chat_format_single(
 // Returns an example of formatted chat
 std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
-    bool use_jinja);
+    bool use_jinja,
+    const std::map<std::string, std::string> & chat_template_kwargs);
 
 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);

common/common.cpp

Lines changed: 10 additions & 1 deletion
@@ -899,7 +899,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--mmproj") {
         CHECK_ARG
-        params.mmproj = argv[i];
+        params.mmproj.path = argv[i];
+        return true;
+    }
+    if (arg == "--mmproj-url") {
+        CHECK_ARG
+        params.mmproj.url = argv[i];
+        return true;
+    }
+    if (arg == "--no-mmproj-offload") {
+        params.mmproj_use_gpu = false;
         return true;
     }
     if (arg == "--image") {

common/common.h

Lines changed: 35 additions & 2 deletions
@@ -68,6 +68,29 @@ struct llama_control_vector_load_info;
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_MTMD,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
 //
 // CLI argument parsing
 //
@@ -86,6 +109,14 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
+struct model_paths {
+    std::string path        = ""; // model local path      // NOLINT
+    std::string url         = ""; // model url to download // NOLINT
+    std::string hf_repo     = ""; // HF repo               // NOLINT
+    std::string hf_file     = ""; // HF file               // NOLINT
+    std::string docker_repo = ""; // Docker repo           // NOLINT
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
@@ -230,8 +261,10 @@ struct gpt_params {
     std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
     std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model
 
-    // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    // multimodal models (see examples/mtmd)
+    model_paths mmproj;
+    bool mmproj_use_gpu = true;  // use GPU for multimodal model
+    bool no_mmproj      = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

examples/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,6 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
@@ -39,6 +38,7 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
+    add_subdirectory(mtmd)
     if (GGML_RPC)
         add_subdirectory(rpc)
     endif()

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
     if (params.conversation) {
         if (params.enable_chat_template) {
             //LOG_TEE("%s: chat template example: %s\n", __func__, common_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
-            LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
+            LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, {}).c_str());
         } else {
             LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }

examples/mtmd/CMakeLists.txt

Lines changed: 62 additions & 0 deletions
New file:

# mtmd

find_package(Threads REQUIRED)

add_library(mtmd
            mtmd.cpp
            mtmd-audio.cpp
            mtmd.h
            clip.cpp
            clip.h
            clip-impl.h
            mtmd-helper.cpp
            mtmd-helper.h
            )

target_link_libraries     (mtmd PUBLIC ggml llama)
target_link_libraries     (mtmd PRIVATE Threads::Threads)
target_include_directories(mtmd PUBLIC  .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../vendor)
target_compile_features   (mtmd PRIVATE cxx_std_17)

if (BUILD_SHARED_LIBS)
    set_target_properties     (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)
    target_compile_definitions(mtmd PUBLIC  LLAMA_SHARED)
endif()

set(MTMD_PUBLIC_HEADERS
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h
    ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h
    )

set_target_properties(mtmd
    PROPERTIES
    PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}")

install(TARGETS mtmd LIBRARY PUBLIC_HEADER)

if (NOT MSVC)
    # for stb_image.h and miniaudio.h
    target_compile_options(mtmd PRIVATE -Wno-cast-qual)
endif()

if (TARGET BUILD_INFO)
    add_dependencies(mtmd BUILD_INFO)
    add_dependencies(mtmd-helper BUILD_INFO)
endif()

add_executable(llama-llava-cli    deprecation-warning.cpp)
add_executable(llama-gemma3-cli   deprecation-warning.cpp)
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
add_executable(llama-qwen2vl-cli  deprecation-warning.cpp)

set(TARGET llama-mtmd-cli)
add_executable        (${TARGET} mtmd-cli.cpp)
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
target_link_libraries  (${TARGET} PRIVATE common mtmd Threads::Threads)
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/mtmd/README.md

Lines changed: 63 additions & 0 deletions
New file:

# Multimodal Support in llama.cpp

This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported.

> [!IMPORTANT]
>
> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**.

The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify:

- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction.
- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure.
- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users.
- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.

## Pre-quantized models

See the list of pre-quantized models [here](../../docs/multimodal.md).
## How it works and what is `mmproj`?

Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.

This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging.

Consequently, running a multimodal model typically requires two GGUF files:
1. The standard language model file.
2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection.
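
To make the two-file layout concrete, here is a minimal sketch of loading both parts. It assumes the mainline-style `libmtmd` C API (`mtmd_context_params_default`, `mtmd_init_from_file`); exact names and signatures in this port may differ, and the file paths are placeholders.

```cpp
#include "llama.h"
#include "mtmd.h"

int main() {
    llama_backend_init();

    // 1) The standard language model, loaded through libllama as usual.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    // 2) The mmproj file, loaded by libmtmd on top of the text model. It holds
    //    the vision encoder plus the projector that maps image features into
    //    the language model's embedding space.
    mtmd_context_params mctx_params = mtmd_context_params_default();
    mtmd_context * ctx_vision = mtmd_init_from_file("mmproj.gguf", model, mctx_params);

    // ... create a llama_context, then tokenize and evaluate multimodal input ...

    mtmd_free(ctx_vision);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```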

## What is `libmtmd`?

As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs.

Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages:
- **Unified Interface:** Aims to consolidate interaction for various multimodal models.
- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library.
- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models.
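The `Processor`-inspired API boils down to: text containing media markers plus raw bitmaps go in, an ordered list of text/image chunks comes out, and a helper evaluates the chunks against the language model. The sketch below continues from the previous one (`ctx_vision` is the mtmd context and `lctx` an assumed `llama_context`); it uses mainline `libmtmd` names (`mtmd_default_marker`, `mtmd_tokenize`, `mtmd_helper_eval_chunks`), which may have drifted relative to this port.

```cpp
// A media marker in the prompt marks where the image embeddings are inserted.
std::string prompt = std::string(mtmd_default_marker()) + " Describe this image.";

mtmd_input_text text;
text.text          = prompt.c_str();
text.add_special   = true;  // add BOS/EOS as the model's template requires
text.parse_special = true;  // recognize the media marker and special tokens

// Load the image; pass one bitmap per marker occurrence in the prompt.
mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_file(ctx_vision, "image.jpg");
const mtmd_bitmap * bitmaps[] = { bmp };

// Tokenize text + bitmaps into an ordered list of text/image chunks.
mtmd_input_chunks * chunks = mtmd_input_chunks_init();
mtmd_tokenize(ctx_vision, chunks, &text, bitmaps, /*n_bitmaps*/ 1);

// Evaluate the chunks in order: image chunks are encoded with the mmproj
// model, text chunks are decoded by the language model, and the KV cache
// position is carried across both.
llama_pos n_past = 0;
mtmd_helper_eval_chunks(ctx_vision, lctx, chunks,
                        /*n_past*/ 0, /*seq_id*/ 0, /*n_batch*/ 512,
                        /*logits_last*/ true, &n_past);

mtmd_input_chunks_free(chunks);
mtmd_bitmap_free(bmp);
```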

## How to obtain `mmproj`

Multimodal projector (`mmproj`) files are specific to each model architecture.

For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file:
- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d); see the guide [here](../../docs/multimodal/gemma3.md). Note: the 1B variant does not have vision support.
- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with a `transformers`-compatible checkpoint
- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen))
- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503)
- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: conversion of `InternVL3-*-hf` models is not supported, only the non-HF version; the `InternLM2Model` **text** model is not supported)

For older models, please refer to the relevant guide for instructions on how to obtain or create the `mmproj` file:

NOTE: conversion scripts are located under `tools/mtmd/legacy-models`

- [LLaVA](../../docs/multimodal/llava.md)
- [MobileVLM](../../docs/multimodal/MobileVLM.md)
- [GLM-Edge](../../docs/multimodal/glmedge.md)
- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md)
- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md)
- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
- [IBM Granite Vision](../../docs/multimodal/granitevision.md)
