@@ -91,13 +91,11 @@ static std::vector<llama_token> llava2_tokenize_text_internal(

 int32_t llava2_tokenize(llava2_context_ptr & ctx,
                         std::vector<llava2_input_chunk> & output,
-                        const std::string & prompt,
-                        bool add_special,
-                        bool parse_special,
+                        const llava2_input_text & text,
                         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);

-    std::string prompt_modified(prompt);
+    std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
     // a bit hacky here, but works for now
@@ -108,7 +106,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }

-    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());

@@ -117,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
         // printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
@@ -273,14 +271,17 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
             int64_t t0 = ggml_time_ms();
+            if (ctx->print_timings) {
+                LOG_INF("encoding image...\n");
+            }
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }

             int32_t n_tokens = chunk.tokens_image.n_tokens;
@@ -294,7 +295,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+                LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
             }

             n_past += n_tokens;
0 commit comments