+#include "clip.h"
+#include "clip-impl.h"
+#include "llava2.h"
+
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
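+// marker in the text prompt that indicates the position of an image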
+static const char * IMG_MARKER = "<image>";
+
+struct llava2_context {
+    struct clip_ctx * ctx_clip;
+    const struct llama_model * text_model;
+    std::vector<float> image_embd_v; // image embedding vector
+    int n_threads;
+
+    llava2_context(const char * mmproj_fname,
+                   const struct llama_model * text_model,
+                   const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads) {
+        clip_context_params ctx_clip_params;
+        ctx_clip_params.use_gpu = ctx_params.use_gpu;
+        ctx_clip_params.verbosity = ctx_params.verbosity;
+        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
+        if (!ctx_clip) {
+            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
+        }
+        this->text_model = text_model;
+    }
+
+    ~llava2_context() {
+        clip_free(ctx_clip);
+    }
+};
+
+struct llava2_image_tokens_data {
+    clip_image_f32_batch_ptr batch_f32; // preprocessed image patches
+};
+
+llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
+                                         const struct llama_model * text_model,
+                                         const struct llava2_context_params ctx_params) {
+    try {
+        auto ctx = std::make_shared<llava2_context>(mmproj_fname, text_model, ctx_params);
+        return ctx;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: error: %s\n", __func__, e.what());
+        return nullptr;
+    }
+}
+
+int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output) {
+    clip_image_u8_ptr img_u8(clip_image_u8_init());
+    bool ok = clip_image_load_from_file(fname, img_u8.get());
+    if (!ok) {
+        LOG_ERR("Unable to load image %s\n", fname);
+        return 1;
+    }
+    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
+    output.data.resize(output.nx * output.ny * 3);
+    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
+    return 0;
+}
+
+// copied from common_tokenize
+static std::vector<llama_token> llava2_tokenize_text_internal(
+        const struct llama_vocab * vocab,
+        const std::string & text,
+        bool add_special,
+        bool parse_special) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens < 0) {
+        // a negative return value is the required token count; resize and tokenize again
+        result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+int32_t llava2_tokenize(llava2_context_ptr & ctx,
+                        std::vector<llava2_input_chunk> & output,
+                        const std::string & prompt,
+                        bool add_special,
+                        bool parse_special,
+                        const std::vector<llava2_bitmap> & bitmaps) {
+    auto vocab = llama_model_get_vocab(ctx->text_model);
+
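+    // split the prompt on IMG_MARKER; an image chunk is inserted at each marker position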
+    std::vector<std::string> parts = string_split_str(prompt, IMG_MARKER);
+    output.clear();
+    output.reserve(parts.size());
+
+    size_t i_img = 0;
+
+    for (const auto & part : parts) {
+        //printf("tokenizing part: %s\n", part.c_str());
+        // add BOS (when requested) only to the very first part
+        bool add_bos = &parts.front() == &part;
+        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        if (tokens.empty()) {
+            continue;
+        }
+        output.push_back({
+            LLAVA2_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        });
+
+        if (&parts.back() != &part) {
+            // this part is followed by an image marker, so insert an image chunk here
+
+            if (i_img >= bitmaps.size()) {
+                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
+                return 2;
+            }
+
+            // shim layer: copy the raw RGB bitmap into a clip_image_u8 understood by clip.cpp
+            clip_image_u8_ptr img_u8(clip_image_u8_init());
+            img_u8->nx = bitmaps[i_img].nx;
+            img_u8->ny = bitmaps[i_img].ny;
+            img_u8->buf.resize(bitmaps[i_img].data.size());
+            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
+
+            // preprocess the image into a batch of f32 patches
+            // (the batch object must be allocated before clip_image_preprocess() can fill it)
+            clip_image_f32_batch_ptr batch_f32(new clip_image_f32_batch);
+            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), batch_f32.get());
+            if (!ok) {
+                LOG_ERR("Unable to preprocess image\n");
+                return 1;
+            }
+
+            llava2_image_tokens image_tokens;
+            //image_tokens.nx = ...;
+            //image_tokens.ny = ...;
+            image_tokens.n_tokens = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
+            image_tokens.data = std::unique_ptr<llava2_image_tokens_data>(
+                new llava2_image_tokens_data{
+                    std::move(batch_f32),
+                }
+            );
+
+            output.push_back({
+                LLAVA2_INPUT_CHUNK_TYPE_IMAGE,
+                {},
+                std::move(image_tokens),
+            });
+            i_img++;
+        }
+    }
+
+    return 0;
+}
+
+LLAVA2_API int32_t llava2_encode(llava2_context_ptr & ctx,
+                                 const llava2_image_tokens & image_tokens) {
+    // size the output buffer (reserve alone would leave it empty); the encoder writes
+    // n_tokens * n_mmproj_embd floats into it
+    ctx->image_embd_v.resize(image_tokens.n_tokens * clip_n_mmproj_embd(ctx->ctx_clip));
+    bool ok = clip_image_batch_encode(
+        ctx->ctx_clip,
+        ctx->n_threads,
+        image_tokens.data->batch_f32.get(),
+        ctx->image_embd_v.data());
+    // keep the "0 == success" convention used by the rest of this file
+    return ok ? 0 : 1;
+}
+
+LLAVA2_API float * llava2_get_output_embd(llava2_context_ptr & ctx) {
+    return ctx->image_embd_v.data();
+}