wip: mtmd: start C API (add mtmd_context_params_default(), remove mtmd_input_chunks alias)

ngxson · ngxson · commit f6b6517c0063 · 2025-04-29T11:47:55.000+02:00
diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp
@@ -112,12 +112,12 @@ struct mtmd_cli_context {
 
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
-            /* use_gpu */   params.mmproj_use_gpu,
-            /* timings */   true,
-            /* n_threads */ params.cpuparams.n_threads,
-            /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
-        }));
+        mtmd_context_params mparams = mtmd_context_params_default();
+        mparams.use_gpu = params.mmproj_use_gpu;
+        mparams.print_timings = true;
+        mparams.n_threads = params.cpuparams.n_threads;
+        mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
         if (!ctx_vision.get()) {
             LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
@@ -228,7 +228,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
     text.text          = formatted_chat.prompt;
     text.add_special   = add_bos;
     text.parse_special = true;
-    mtmd_input_chunks chunks;
+    std::vector<mtmd_input_chunk> chunks;
 
     if (g_is_interrupted) return 0;
 
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
@@ -21,6 +21,16 @@ enum mtmd_slice_tmpl {
     // TODO @ngxson : add support for idefics (SmolVLM)
 };
 
+mtmd_context_params mtmd_context_params_default() { // build the default parameter set
+    mtmd_context_params defaults;                    // every field is assigned below
+    defaults.image_marker  = MTMD_DEFAULT_IMAGE_MARKER;
+    defaults.verbosity     = GGML_LOG_LEVEL_INFO;
+    defaults.n_threads     = 4;
+    defaults.print_timings = true;
+    defaults.use_gpu       = true;
+    return defaults;
+}
+
 struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
@@ -411,7 +421,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
+size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks) {
     size_t n_tokens = 0;
     for (auto & chunk : chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -462,7 +472,7 @@ struct decode_embd_batch {
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        mtmd_input_chunks & chunks,
+        std::vector<mtmd_input_chunk> & chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h
@@ -5,9 +5,15 @@
 #include "llama.h"
 #include "clip.h"
 
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
 #include <vector>
 #include <cinttypes>
 #include <memory>
+#endif
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -23,7 +29,7 @@
 #    define MTMD_API
 #endif
 
-#ifdef __cplusplus
+#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
 
 enum mtmd_input_chunk_type {
     MTMD_INPUT_CHUNK_TYPE_TEXT,
@@ -33,6 +39,75 @@ enum mtmd_input_chunk_type {
 struct mtmd_context;
 struct mtmd_image_tokens;
 
+//
+// C API
+// this is made to closely resemble the C++ API
+//
+
+// forward declaration for C API (the actual struct is defined in C++)
+struct mtmd_bitmap;
+struct mtmd_input_chunk;
+
+struct mtmd_context_params { // no member initializers: obtain defaults from mtmd_context_params_default()
+    bool use_gpu;                  // mtmd_context_params_default() sets this to true
+    bool print_timings;            // mtmd_context_params_default() sets this to true
+    int n_threads;                 // mtmd_context_params_default() sets this to 4
+    enum ggml_log_level verbosity; // mtmd_context_params_default() sets this to GGML_LOG_LEVEL_INFO
+    const char * image_marker;     // mtmd_context_params_default() sets this to MTMD_DEFAULT_IMAGE_MARKER
+};
+
+MTMD_API struct mtmd_context_params mtmd_context_params_default(void); // returns a params struct with every field set to its default
+
+// initialize the mtmd context
+// return a null pointer on failure
+MTMD_API struct mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+                                                   const struct llama_model * text_model,
+                                                   const struct mtmd_context_params ctx_params);
+
+MTMD_API void mtmd_free(struct mtmd_context * ctx);
+
+// get output embeddings from the last encode pass
+MTMD_API float * mtmd_get_output_embd(struct mtmd_context * ctx);
+
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(struct mtmd_context * ctx);
+
+// mtmd_bitmap
+//
+// length of data must be nx * ny * 3
+// the data is in RGBRGBRGB... format
+// the id is optional (can be a null pointer), but useful for KV cache tracking
+MTMD_API struct mtmd_bitmap * mtmd_bitmap_init(
+    uint32_t nx,
+    uint32_t ny,
+    const unsigned char * data,
+    const char * id, size_t id_len);
+MTMD_API uint32_t              mtmd_bitmap_get_nx  (struct mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_ny  (struct mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data(struct mtmd_bitmap * bitmap);
+MTMD_API const char *          mtmd_bitmap_get_id  (struct mtmd_bitmap * bitmap);
+MTMD_API void                  mtmd_bitmap_free    (struct mtmd_bitmap * bitmap);
+
+// mtmd_input_chunk
+//
+// the instance can be constructed via mtmd_tokenize()
+MTMD_API enum mtmd_input_chunk_type       mtmd_input_chunk_get_type        (const struct mtmd_input_chunk * chunk);
+MTMD_API const llama_token *              mtmd_input_chunk_get_tokens_text (const struct mtmd_input_chunk * chunk, size_t * n_tokens_output);
+MTMD_API const struct mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const struct mtmd_input_chunk * chunk);
+MTMD_API void                             mtmd_input_chunk_free            (struct mtmd_input_chunk * chunk);
+
+
+//
+// C++ API
+//
+
+#ifdef __cplusplus
+
+struct mtmd_context_deleter { // deleter functor: hands the context back to mtmd_free()
+    void operator()(mtmd_context * ctx) { mtmd_free(ctx); }
+};
+using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
 // represents raw image data, layout is RGBRGBRGB...
 // length of data must be nx * ny * 3
 struct mtmd_bitmap {
@@ -53,30 +128,12 @@ struct mtmd_input_chunk {
     mtmd_image_tokens_ptr tokens_image;
 };
 
-using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
-
-struct mtmd_context_params {
-    bool use_gpu = true;
-    bool print_timings = true;
-    int n_threads = 4;
-    enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
-    const char * image_marker = "<__image__>";
-};
-
 struct mtmd_input_text {
     std::string text;
     bool add_special;
     bool parse_special;
 };
 
-// initialize the mtmd context
-// return nullptr on failure
-MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
-                                                const llama_model * text_model,
-                                                const mtmd_context_params ctx_params);
-
-MTMD_API void mtmd_free(mtmd_context * ctx);
-
 // tokenize an input text prompt and an image
 // the prompt must have the input image marker (default: "<__image__>") in it
 // the marker will be replaced with the image tokens
@@ -108,20 +165,14 @@ MTMD_API void        mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
 MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
                             const mtmd_image_tokens * image_tokens);
 
-// get output embeddings from the last encode pass
-MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
-
-// whether we need to set non-causal mask before llama_decode
-MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
-
 
 
 //
 // helper functions (can be implemented based on other functions)
 //
 
 // helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
+MTMD_API size_t mtmd_helper_get_n_tokens(std::vector<mtmd_input_chunk> & chunks);
 
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
@@ -130,7 +181,7 @@ MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
 // otherwise, returns 0 on success
 MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
                                 llama_context * lctx,
-                                mtmd_input_chunks & chunks,
+                                std::vector<mtmd_input_chunk> & chunks,
                                 llama_pos pos0,
                                 llama_seq_id seq_id,
                                 int32_t n_batch);
@@ -146,18 +197,7 @@ MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitm
 // this function is thread-safe
 MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
 
-// convenient unique_ptr wrappers
-struct mtmd_context_deleter {
-    void operator()(mtmd_context * val) { mtmd_free(val); }
-};
-using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
-
 #endif
 
-//
-// C API
-//
-
-
 
 #endif