Skip to content

Commit 117bf73

Browse files
committed
update llava2_tokenize
1 parent 1576c82 commit 117bf73

File tree

3 files changed

+23
-12
lines changed

3 files changed

+23
-12
lines changed

examples/llava/gemma3-cli.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,11 @@ static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector
181181
}
182182

183183
std::vector<llava2_input_chunk> chunks;
184-
if (llava2_tokenize(ctx.ctx_vision, chunks, formatted_chat.prompt, add_bos, true, bitmaps)) {
184+
llava2_input_text text;
185+
text.text = formatted_chat.prompt;
186+
text.add_special = add_bos;
187+
text.parse_special = true;
188+
if (llava2_tokenize(ctx.ctx_vision, chunks, text, bitmaps)) {
185189
LOG_ERR("Unable to tokenize prompt\n");
186190
return 1;
187191
}

examples/llava/llava2.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,11 @@ static std::vector<llama_token> llava2_tokenize_text_internal(
9191

9292
int32_t llava2_tokenize(llava2_context_ptr & ctx,
9393
std::vector<llava2_input_chunk> & output,
94-
const std::string & prompt,
95-
bool add_special,
96-
bool parse_special,
94+
const llava2_input_text & text,
9795
const std::vector<llava2_bitmap> & bitmaps) {
9896
auto vocab = llama_model_get_vocab(ctx->text_model);
9997

100-
std::string prompt_modified(prompt);
98+
std::string prompt_modified(text.text);
10199
std::string marker_modified(ctx->image_marker);
102100
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
103101
// a bit hacky here, but works for now
@@ -108,7 +106,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
108106
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
109107
}
110108

111-
std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
109+
std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
112110
output.clear();
113111
output.reserve(parts.size());
114112

@@ -117,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
117115
for (const auto & part : parts) {
118116
//printf("tokenizing part: %s\n", part.c_str());
119117
bool add_bos = &parts.front() == &part;
120-
auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
118+
auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
121119
if (tokens.empty()) {
122120
continue;
123121
}
@@ -273,14 +271,17 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
273271
} else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
274272
GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
275273
int64_t t0 = ggml_time_ms();
274+
if (ctx->print_timings) {
275+
LOG_INF("encoding image...\n");
276+
}
276277
ret = llava2_encode(ctx, chunk.tokens_image);
277278
if (ret != 0) {
278279
LOG_ERR("failed to encode image\n");
279280
llama_batch_free(text_batch);
280281
return ret;
281282
}
282283
if (ctx->print_timings) {
283-
LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
284+
LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
284285
}
285286

286287
int32_t n_tokens = chunk.tokens_image.n_tokens;
@@ -294,7 +295,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
294295
return ret;
295296
}
296297
if (ctx->print_timings) {
297-
LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
298+
LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
298299
}
299300

300301
n_past += n_tokens;

examples/llava/llava2.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ struct llava2_context_params {
6666
const char * image_marker = "<__image__>";
6767
};
6868

69+
struct llava2_input_text {
70+
std::string text;
71+
bool add_special;
72+
bool parse_special;
73+
};
74+
6975
// initialize the llava2 context
7076
// return nullptr on failure
7177
LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
@@ -74,6 +80,7 @@ LLAVA2_API llava2_context_ptr llava2_init_from_file(const char * mmproj_fname,
7480

7581
// helper function to load an image from a file
7682
// returns 0 on success
83+
// this function is thread-safe
7784
LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitmap & output);
7885

7986
// tokenize an input text prompt and an image
@@ -86,11 +93,10 @@ LLAVA2_API int32_t llava2_bitmap_init_from_file(const char * fname, llava2_bitma
8693
// 2. (image tokens)
8794
// 3. "<end_of_image>\ndescribe it in detail."
8895
// number of bitmaps must be equal to the number of image markers in the prompt
96+
// this function is thread-safe (shared ctx)
8997
LLAVA2_API int32_t llava2_tokenize(llava2_context_ptr & ctx,
9098
std::vector<llava2_input_chunk> & output,
91-
const std::string & prompt,
92-
bool add_special,
93-
bool parse_special,
99+
const llava2_input_text & text,
94100
const std::vector<llava2_bitmap> & bitmaps);
95101

96102
// returns 0 on success

0 commit comments

Comments (0)