
Commit 7cc4108

committed
add timings
1 parent 94564ac commit 7cc4108

4 files changed, +13 -1 lines changed

examples/llava/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ target_link_libraries(llava2 PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
 
 target_include_directories(llava2 PUBLIC .)
 target_include_directories(llava2 PUBLIC ../..)
+target_include_directories(llava2 PUBLIC ../../common) # for stb_image.h
 
 target_compile_features(llava2 PRIVATE cxx_std_17)
 

examples/llava/gemma3-cli.cpp

Lines changed: 1 addition & 0 deletions
@@ -88,6 +88,7 @@ struct gemma3_context {
         const char * clip_path = params.mmproj.path.c_str();
         ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
             /* use_gpu */   true,
+            /* timings */   true,
             /* n_threads */ params.cpuparams.n_threads,
             /* verbosity */ GGML_LOG_LEVEL_INFO,
         });
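gemma3-cli opts in explicitly; a caller that wants the new per-image log lines suppressed would pass false in the same slot. A minimal sketch (not part of this commit), reusing the field order from llava2.h and the same variables as above:

ctx_vision = llava2_init_from_file(clip_path, model, llava2_context_params{
    /* use_gpu */   true,
    /* timings */   false, // sketch: suppress the "Image encoded/decoded in ... ms" lines
    /* n_threads */ params.cpuparams.n_threads,
    /* verbosity */ GGML_LOG_LEVEL_INFO,
});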

examples/llava/llava2.cpp

Lines changed: 10 additions & 1 deletion
@@ -16,14 +16,15 @@ struct llava2_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
+    bool print_timings;
     int n_threads;
     std::string image_marker;
 
     // TODO @ngxson : add timings
 
     llava2_context(const char * mmproj_fname,
             const struct llama_model * text_model,
-            const struct llava2_context_params & ctx_params) : n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
+            const struct llava2_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
@@ -260,22 +261,30 @@ int32_t llava2_helper_eval(llava2_context_ptr & ctx,
 
         } else if (chunk.type == LLAVA2_INPUT_CHUNK_TYPE_IMAGE) {
             GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
+            int64_t t0 = ggml_time_ms();
             ret = llava2_encode(ctx, chunk.tokens_image);
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+            }
 
             int32_t n_tokens = chunk.tokens_image.n_tokens;
             float * embd = llava2_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+            int64_t t1 = ggml_time_ms();
             ret = llama_decode(lctx, batch_img.batch);
             if (ret != 0) {
                 LOG_ERR("failed to decode image\n");
                 llama_batch_free(text_batch);
                 return ret;
             }
+            if (ctx->print_timings) {
+                LOG_INF("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+            }
 
             n_past += n_tokens;
 
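The pattern added around llava2_encode() and llama_decode() is ggml's wall-clock timer: record ggml_time_ms() before the call and print the delta only when the flag is set. A self-contained sketch of that pattern, with a sleep standing in for the encode step (this is illustrative code, not code from the commit):

#include <chrono>
#include <cinttypes>
#include <cstdio>
#include <thread>
#include "ggml.h" // for ggml_time_init() and ggml_time_ms()

int main() {
    ggml_time_init();          // initialize ggml's timers once per process
    bool print_timings = true; // mirrors llava2_context_params::print_timings

    int64_t t0 = ggml_time_ms();
    std::this_thread::sleep_for(std::chrono::milliseconds(50)); // stand-in for llava2_encode()
    if (print_timings) {
        printf("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
    }
    return 0;
}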

examples/llava/llava2.h

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ struct llava2_input_chunk {
 
 struct llava2_context_params {
     bool use_gpu = true;
+    bool print_timings = true;
     int n_threads = 4;
     enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
     const char * image_marker = "<__image__>";
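Because the new field defaults to true, code that default-constructs the params struct keeps compiling and simply gains the timing output; opting out is one assignment. A sketch (assuming the caller fills the struct by name rather than positionally, as in the fragment above):

llava2_context_params ctx_params; // use_gpu = true, print_timings = true, n_threads = 4, ...
ctx_params.print_timings = false; // turn off the "Image encoded/decoded in ... ms" lines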

0 commit comments
