 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
+#include "chat.h"
 #include "llava2.h"
 
 #include <vector>
@@ -56,13 +57,18 @@ static void sigint_handler(int signo) {
 #endif
 
 struct gemma3_context {
-    llava2_context_ptr ctx_llava2;
+    llava2_context_ptr ctx_vision;
     common_init_result llama_init;
 
     llama_model       * model;
     llama_context     * lctx;
     const llama_vocab * vocab;
     llama_batch         batch;
+    int                 n_batch;
+
+    // note: the gemma3 chat template is "linear", meaning each turn is completely separate from the others,
+    // so we don't need to keep track of the chat history here
+    common_chat_templates_ptr tmpls;
 
     int n_threads    = 1;
     llama_pos n_past = 0;
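
The "linear" property noted above can be read off the hard-coded prompt strings that this file used to build by hand (removed further down in this diff): every Gemma 3 turn renders as a self-contained block, so a new turn can simply be appended to the KV cache without re-rendering earlier messages. Roughly (whitespace approximate):

    <start_of_turn>user
    ...user text, with image tokens expanded in place...<end_of_turn><start_of_turn>model
    ...model reply...<end_of_turn>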
@@ -73,18 +79,20 @@ struct gemma3_context {
         vocab     = llama_model_get_vocab(model);
         n_threads = params.cpuparams.n_threads;
         batch     = llama_batch_init(params.n_batch, 0, 1);
-        init_clip_model(params);
+        n_batch   = params.n_batch;
+        tmpls     = common_chat_templates_init(model, params.chat_template);
+        init_vision_context(params);
     }
 
-    void init_clip_model(common_params & params) {
+    void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
-        ctx_llava2 = llava2_init_from_file(clip_path, model, llava2_context_params{
+        ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu */   true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
-        if (!ctx_llava2.get()) {
-            LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
+        if (!ctx_vision.get()) {
+            LOG_ERR("Failed to load vision model from %s\n", clip_path);
             exit(1);
         }
     }
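
The three llava2_context_params fields above are the entire configuration surface this example uses, so a standalone vision-context setup reduces to the sketch below; the file path is a placeholder, and only calls that appear in this diff are used:

    llava2_context_ptr ctx_vision = llava2_init_from_file("mmproj-model-f16.gguf", model, llava2_context_params{
        /* use_gpu */   true,
        /* n_threads */ 4,
        /* verbosity */ GGML_LOG_LEVEL_INFO,
    });
    if (!ctx_vision.get()) {
        // mmproj file missing or incompatible with the text model
        exit(1);
    }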
@@ -123,77 +131,6 @@ struct decode_embd_batch {
     }
 };
 
-static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
-    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
-    common_batch_clear(ctx.batch);
-    for (llama_token & t : tokens) {
-        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
-    }
-    if (logits_last) {
-        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
-    }
-    //LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
-    if (llama_decode(ctx.lctx, ctx.batch)) {
-        LOG_ERR("Failed to decode text\n");
-        return 1;
-    }
-    return 0;
-}
-
-static int eval_image(gemma3_context & ctx, std::string & fname) {
-    std::vector<float> image_embd_v;
-    int n_embd = llama_model_n_embd(ctx.model);
-    int n_tokens = 256;
-    image_embd_v.resize(n_tokens * n_embd);
-
-    bool ok;
-    struct clip_image_u8 * img_u8 = clip_image_u8_init();
-    ok = clip_image_load_from_file(fname.c_str(), img_u8);
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname.c_str());
-        clip_image_u8_free(img_u8);
-        return 2; // non-fatal error
-    }
-
-    clip_image_f32_batch batch_f32;
-    ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
-    if (!ok) {
-        LOG_ERR("Unable to preprocess image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-
-    int64_t t0 = ggml_time_ms();
-    LOG("Encoding image %s\n", fname.c_str());
-    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
-    if (!ok) {
-        LOG_ERR("Unable to encode image\n");
-        clip_image_f32_batch_free(&batch_f32);
-        clip_image_u8_free(img_u8);
-        return 1;
-    }
-    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-
-    clip_image_f32_batch_free(&batch_f32);
-    clip_image_u8_free(img_u8);
-
-    // decode image embeddings
-    int64_t t1 = ggml_time_ms();
-    eval_text(ctx, "<start_of_image>");
-    llama_set_causal_attn(ctx.lctx, false);
-    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
-    if (llama_decode(ctx.lctx, batch_img.batch)) {
-        LOG_ERR("failed to decode image\n");
-        return 1;
-    }
-    ctx.n_past += n_tokens;
-    llama_set_causal_attn(ctx.lctx, true);
-    eval_text(ctx, "<end_of_image>");
-    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-    return 0;
-}
-
 static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
     for (int i = 0; i < n_predict; i++) {
         if (i > n_predict || !g_is_generating) {
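
The deleted eval_image above is worth reading for the decode pattern it spells out, which the llava2 helpers presumably now encapsulate: image embeddings are injected with causal attention turned off, so the image tokens attend to each other bidirectionally, and causal masking is restored before the following text. Reduced to its core:

    llama_set_causal_attn(ctx.lctx, false);    // bidirectional attention across the image span
    llama_decode(ctx.lctx, batch_img.batch);   // feed the precomputed image embeddings
    ctx.n_past += n_tokens;
    llama_set_causal_attn(ctx.lctx, true);     // back to causal attention for text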
@@ -223,6 +160,41 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
     return 0;
 }
 
+static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
+    std::vector<llava2_bitmap> bitmaps;
+
+    common_chat_templates_inputs tmpl_inputs;
+    tmpl_inputs.messages = {msg};
+    tmpl_inputs.add_generation_prompt = true;
+    tmpl_inputs.use_jinja = false; // jinja is buggy here
+    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+
+    for (auto & fname : images_fname) {
+        llava2_bitmap bitmap;
+        if (llava2_bitmap_init_from_file(fname.c_str(), bitmap)) {
+            LOG_ERR("Unable to load image %s\n", fname.c_str());
+            return 2; // image not found
+        }
+        bitmaps.push_back(std::move(bitmap));
+    }
+
+    std::vector<llava2_input_chunk> chunks;
+    if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
+        LOG_ERR("Unable to tokenize prompt\n");
+        return 1;
+    }
+
+    if (llava2_helper_eval(ctx.ctx_vision, ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)) {
+        LOG_ERR("Unable to eval prompt\n");
+        return 1;
+    }
+
+    ctx.n_past += llava2_helper_get_n_tokens(chunks);
+
+    return 0;
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
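eval_message plus the pre-existing generate_response make up the whole per-turn flow. A minimal single-turn sketch using only functions defined in this file (the image path and n_predict value are placeholders):

    common_chat_msg msg;
    msg.role    = "user";
    msg.content = "What is in this photo? <__image__>";
    std::vector<std::string> images = { "photo.jpg" }; // placeholder path
    if (eval_message(ctx, msg, images, /* add_bos */ true) == 0) {
        generate_response(ctx, smpl, /* n_predict */ 256);
    }
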
@@ -264,22 +236,15 @@ int main(int argc, char ** argv) {
 #endif
     }
 
-    if (eval_text(ctx, "<bos>")) {
-        return 1;
-    }
-
     if (is_single_turn) {
         g_is_generating = true;
-        std::string prompt = "<start_of_turn>user\n<image>" + params.prompt + "<end_of_turn><start_of_turn>model\n";
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
-        for (auto & fname : params.image) {
-            if (eval_image(ctx, fname)) {
-                return 1;
-            }
+        if (params.prompt.find("<__image__>") == std::string::npos) {
+            params.prompt += "<__image__>";
         }
-        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
+        common_chat_msg msg;
+        msg.role    = "user";
+        msg.content = params.prompt;
+        if (eval_message(ctx, msg, params.image, true)) {
             return 1;
         }
         if (generate_response(ctx, smpl, n_predict)) {
@@ -293,9 +258,9 @@ int main(int argc, char ** argv) {
         LOG("\n/quit or /exit exit the program");
         LOG("\n");
 
-        if (eval_text(ctx, "<start_of_turn>user\n")) {
-            return 1;
-        }
+        bool is_first_msg = true;
+        std::vector<std::string> images_fname;
+        std::string content;
 
         while (true) {
             g_is_generating = false;
@@ -320,24 +285,31 @@ int main(int argc, char ** argv) {
             g_is_generating = true;
             if (line.find("/image") == 0) {
                 std::string image = line.substr(7);
-                int res = eval_image(ctx, image);
-                if (res == 2) {
-                    continue; // image not found
-                }
-                if (res) {
-                    return 1;
-                }
+                images_fname.push_back(string_strip(image));
+                content += "<__image__>";
                 continue;
+            } else {
+                content += line;
             }
-            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
-                return 1;
+            common_chat_msg msg;
+            msg.role    = "user";
+            msg.content = content;
+            int ret = eval_message(ctx, msg, images_fname, is_first_msg);
+            if (ret == 2) {
+                // non-fatal error
+                images_fname.clear();
+                content.clear();
+                continue;
             }
-            if (generate_response(ctx, smpl, n_predict)) {
+            if (ret) {
                 return 1;
             }
-            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
+            if (generate_response(ctx, smpl, n_predict)) {
                 return 1;
             }
+            images_fname.clear();
+            content.clear();
+            is_first_msg = false;
         }
     }
 
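In chat mode the loop accumulates content and images_fname across successive /image commands, evaluates the turn once a regular line arrives, then clears both for the next turn; is_first_msg only controls whether a BOS token is prepended. A session might look like this (model output hypothetical):

    > /image photo.jpg
    > what is in this picture?
    A cat sitting on a windowsill.
    > /quit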