@@ -91,13 +91,11 @@ static std::vector<llama_token> llava2_tokenize_text_internal(

 int32_t llava2_tokenize(llava2_context_ptr & ctx,
                         std::vector<llava2_input_chunk> & output,
-                        const std::string & prompt,
-                        bool add_special,
-                        bool parse_special,
+                        const llava2_input_text & text,
                         const std::vector<llava2_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);

-    std::string prompt_modified(prompt);
+    std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
     projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
     // a bit hacky here, but works for now
@@ -108,7 +106,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }

-    std::vector<std::string> parts = string_split_str(prompt, ctx->image_marker);
+    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());

@@ -117,7 +115,7 @@ int32_t llava2_tokenize(llava2_context_ptr & ctx,
     for (const auto & part : parts) {
         // printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
-        auto tokens = llava2_tokenize_text_internal(vocab, part, add_special && add_bos, parse_special);
+        auto tokens = llava2_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
         if (tokens.empty()) {
             continue;
         }
@@ -273,14 +271,17 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
             int64_t t0 = ggml_time_ms();
+            if (ctx->print_timings) {
+                LOG_INF("encoding image...\n");
+            }
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }

             int32_t n_tokens = chunk.tokens_image.n_tokens;
@@ -294,7 +295,7 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
                 return ret;
             }
             if (ctx->print_timings) {
-                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+                LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
             }

             n_past += n_tokens;
0 commit comments