
Commit b6fd008

Author: Anoop Kapoor (committed)

Fix extra llama-cpp model-related log messages that appeared when the compilation flag GGML_PERF_DETAIL was enabled.

1 parent 4b00537 commit b6fd008
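
The change standardizes the preprocessor guard around the perf instrumentation: guards that previously checked only GGML_PERF now also fire for GGML_PERF_DETAIL. A minimal, illustrative sketch of how the new guard resolves (not code from the commit; build with -DGGML_PERF or -DGGML_PERF_DETAIL to toggle it):

#include <stdio.h>

int main(void) {
    /* Illustration only: the same guard pattern the commit adopts. */
#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
    printf("perf instrumentation compiled in\n");
#else
    printf("perf instrumentation compiled out\n");
#endif /* GGML_PERF || GGML_PERF_DETAIL */
    return 0;
}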

4 files changed: +17 -20 lines

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 4 deletions
@@ -2879,12 +2879,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
         int64_t t_start = ggml_time_us();
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
         ggml_compute_forward(&params, node);

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
         int64_t t_end = ggml_time_us();
         node->perf_runs++;
         if (t_end >= t_start) {
@@ -2893,7 +2893,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             // Handle wraparound by assuming timer rolls over at max int64_t value
             node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
         }
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
         if (state->ith == 0 && cplan->abort_callback &&
             cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
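
For context, the guarded block above computes a wraparound-safe elapsed time: it assumes the microsecond timer rolls over at INT64_MAX, so when t_end < t_start it adds the distance to INT64_MAX plus the distance past zero. A standalone sketch of that arithmetic (hypothetical values, not part of the commit):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe elapsed time, mirroring the guarded block above. */
static int64_t elapsed_us(int64_t t_start, int64_t t_end) {
    if (t_end >= t_start) {
        return t_end - t_start;
    }
    /* timer rolled over at INT64_MAX between the two samples */
    return (INT64_MAX - t_start) + t_end + 1;
}

int main(void) {
    printf("%lld\n", (long long) elapsed_us(100, 250));          /* 150 */
    printf("%lld\n", (long long) elapsed_us(INT64_MAX - 5, 10)); /* 16 */
    return 0;
}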

src/llama-context.cpp

Lines changed: 9 additions & 12 deletions
@@ -1090,16 +1090,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     ggml_status status;
     const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

-#ifdef GGML_PERF
+#if defined(GGML_PERF)
     ggml_perf_accumulate(perf_totals, res->get_gf());
-#endif /* GGML_PERF */
-
-#ifdef GGML_PERF_DETAIL
-    if (perf_all_shape_fp) {
-        ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
-    }
+#elif defined(GGML_PERF_DETAIL)
     ggml_perf_accumulate(perf_totals, res->get_gf());
-#endif /* GGML_PERF_DETAI */
+    ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
+#endif /* GGML_PERF || GGML_PERF_DETAIL */


     if (!res) {
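
Read straight through, the decode-path block after this hunk behaves as follows (a reconstructed reading of the diff, not additional code from the commit): with GGML_PERF defined, only the ggml_perf_accumulate call is compiled; with GGML_PERF_DETAIL defined instead, the preprocessor keeps roughly these two calls:

    ggml_perf_accumulate(perf_totals, res->get_gf());
    ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);

With neither flag defined, the whole block compiles out.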
@@ -2763,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
 }


-#ifdef GGML_PERF
+#if defined(GGML_PERF)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
@@ -2791,7 +2787,8 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         }
     }
 }
-#elif GGML_PERF_DETAIL
+
+#elif defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
@@ -2855,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
     LLAMA_LOG_TSAVORITE("\n%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
@@ -2864,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
     LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));

     ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
 }

 void llama_perf_context_reset(llama_context * ctx) {

src/llama-context.h

Lines changed: 2 additions & 2 deletions
@@ -198,9 +198,9 @@ struct llama_context {

     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
     struct ggml_perf_totals perf_totals[GGML_OP_COUNT] = {}; // add this to llama_context
-#endif
+#endif /* GGML_PERF || GGML_PERF_DETAIL */

 private:
     llm_graph_params graph_params(

tools/main/main.cpp

Lines changed: 2 additions & 2 deletions
@@ -126,9 +126,9 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
     llama_log_set(my_logger, nullptr);
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
     LOG_INF("%s: llama backend init\n", __func__);

     llama_backend_init();
