 #include "clip.h"
 #include "clip-impl.h"
-#include "llava2.h"
+#include "mtmd.h"
 
 #include "llama.h"
 
 #include <limits>
 #include <vector>
 
-struct llava2_context {
+struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
@@ -22,9 +22,9 @@ struct llava2_context {
 
     // TODO @ngxson : add timings
 
-    llava2_context(const char * mmproj_fname,
+    mtmd_context(const char * mmproj_fname,
            const struct llama_model * text_model,
-           const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+           const struct mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
        clip_context_params ctx_clip_params;
        ctx_clip_params.use_gpu = ctx_params.use_gpu;
        ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -35,28 +35,28 @@ struct llava2_context {
        this->text_model = text_model;
     }
 
-    ~llava2_context() {
+    ~mtmd_context() {
        clip_free(ctx_clip);
     }
 };
 
-struct llava2_image_tokens_data {
+struct mtmd_image_tokens_data {
     clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
 };
 
-llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+mtmd_context_ptr mtmd_init_from_file(const char * mmproj_fname,
        const struct llama_model * text_model,
-       const struct llava2_context_params ctx_params) {
+       const struct mtmd_context_params ctx_params) {
     try {
-        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        auto ctx = std::make_shared<mtmd_context>(mmproj_fname, text_model, ctx_params);
        return ctx;
     } catch (const std::exception & e) {
        LOG_ERR("%s: error: %s\n", __func__, e.what());
        return nullptr;
     }
 }
 
-int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+int32_t mtmd_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
     clip_image_u8_ptr img_u8(clip_image_u8_init());
     bool ok = clip_image_load_from_file(fname, img_u8.get());
     if (!ok) {
@@ -70,7 +70,7 @@ int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output)
 }
 
 // copied from common_tokenize
-static std::vector<llama_token> llava2_tokenize_text_internal(
+static std::vector<llama_token> mtmd_tokenize_text_internal(
        const struct llama_vocab * vocab,
        const std::string & text,
        bool add_special,
@@ -89,10 +89,10 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
     return result;
 }
 
-int32_t llava2_tokenize(llava2_context_ptr & ctx,
-        std::vector<llava2_input_chunk> & output,
-        const llava2_input_text & text,
-        const std::vector<llava2_bitmap> & bitmaps) {
+int32_t mtmd_tokenize(mtmd_context_ptr & ctx,
+        std::vector<mtmd_input_chunk> & output,
+        const mtmd_input_text & text,
+        const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
@@ -115,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
        // printf("tokenizing part: %s\n", part.c_str());
        bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
+        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
        if (tokens.empty()) {
            continue;
        }
@@ -148,12 +148,12 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
            return 1;
        }
 
-        llava2_image_tokens image_tokens;
+        mtmd_image_tokens image_tokens;
        image_tokens.nx = 0; // TODO
        image_tokens.ny = 0; // TODO
        image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-        image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
-            new llava2_image_tokens_data {
+        image_tokens.data = std::unique_ptr<mtmd_image_tokens_data>(
+            new mtmd_image_tokens_data {
                std::move(batch_f32),
            }
        );
@@ -170,8 +170,8 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     return 0;
 }
 
-LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
-        const llava2_image_tokens & image_tokens) {
+LLAVA2_API int32_t mtmd_encode(mtmd_context_ptr & ctx,
+        const mtmd_image_tokens & image_tokens) {
     int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
     ctx->image_embd_v.resize(image_tokens.n_tokens * n_mmproj_embd);
     bool ok = clip_image_batch_encode(
@@ -182,11 +182,11 @@ LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
     return ok ? 0 : 1;
 }
 
-LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
+LLAVA2_API float * mtmd_get_output_embd(mtmd_context_ptr & ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t llava2_helper_get_n_tokens(std::vector<llava2_input_chunk> & chunks) {
+size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
     size_t n_tokens = 0;
     for (auto & chunk : chunks) {
        if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_TEXT) {
@@ -235,9 +235,9 @@ struct decode_embd_batch {
     }
 };
 
-int32_t llava2_helper_eval(llava2_context_ptr & ctx,
+int32_t mtmd_helper_eval(mtmd_context_ptr & ctx,
        llama_context * lctx,
-        std::vector<llava2_input_chunk> & chunks,
+        std::vector<mtmd_input_chunk> & chunks,
        llama_pos pos0,
        llama_seq_id seq_id,
        int32_t n_batch) {
@@ -274,7 +274,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
            if (ctx->print_timings) {
                LOG_INF("encoding image...\n");
            }
-            ret = llava2_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image);
            if (ret != 0) {
                LOG_ERR("failed to encode image\n");
                llama_batch_free(text_batch);
@@ -285,7 +285,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
            }
 
            int32_t n_tokens = chunk.tokens_image.n_tokens;
-            float * embd = llava2_get_output_embd(ctx);
+            float * embd = mtmd_get_output_embd(ctx);
            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
            int64_t t1 = ggml_time_ms();
            ret = llama_decode(lctx, batch_img.batch);
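
For context on how the renamed entry points above fit together (init the projector context from the mmproj file, load a bitmap, tokenize text plus images into chunks, then evaluate the chunks against a llama_context), here is a minimal, hypothetical caller sketch. It is not part of this diff: the image marker string, the zero-initialization of mtmd_context_params, and the error handling are assumptions, and mtmd.h remains the authoritative reference for the API.

// Hypothetical usage sketch, assuming only the API surface shown in this diff.
#include "mtmd.h"
#include "llama.h"

#include <vector>

static int32_t eval_one_image(llama_context * lctx, const llama_model * model,
                              const char * mmproj_path, const char * image_path) {
    mtmd_context_params params{};            // zero-init; only fields shown in the diff are set below
    params.use_gpu       = true;
    params.print_timings = true;
    params.n_threads     = 4;
    params.image_marker  = "<__image__>";    // assumed marker string; use whatever mtmd.h defines

    // load the multimodal projector (mmproj) alongside the text model
    mtmd_context_ptr ctx = mtmd_init_from_file(mmproj_path, model, params);
    if (!ctx) {
        return 1;
    }

    // decode the image file into a bitmap
    mtmd_bitmap bitmap;
    if (mtmd_bitmap_init_from_file(image_path, bitmap) != 0) {
        return 1;
    }
    std::vector<mtmd_bitmap> bitmaps;
    bitmaps.push_back(std::move(bitmap));

    // tokenize a prompt containing the image marker into text/image chunks
    mtmd_input_text text;
    text.text          = "describe this image in detail: <__image__>";
    text.add_special   = true;
    text.parse_special = true;

    std::vector<mtmd_input_chunk> chunks;
    if (mtmd_tokenize(ctx, chunks, text, bitmaps) != 0) {
        return 1;
    }

    // encode the image chunks and decode everything starting at position 0 on sequence 0
    return mtmd_helper_eval(ctx, lctx, chunks, 0, 0, llama_n_batch(lctx));
}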