@@ -1090,6 +1090,14 @@ int llama_context::decode(const llama_batch & batch_inp) {
     ggml_status status;
     const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
 
+#if defined(GGML_PERF)
+    ggml_perf_accumulate(perf_totals, res->get_gf());
+#elif defined(GGML_PERF_DETAIL)
+    ggml_perf_accumulate(perf_totals, res->get_gf());
+    ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
+
+
     if (!res) {
         // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
         llama_pos pos_min[LLAMA_MAX_SEQ];
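One caveat worth flagging in this hunk: the new perf block dereferences `res` before the `if (!res)` check that immediately follows it, so a failed or aborted ubatch would crash here. A minimal sketch of a guarded version that preserves the original `#elif` precedence (a suggested rearrangement, not code from the patch):

#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
    // Only touch the graph if the ubatch actually produced a result;
    // the existing error path below handles the null case.
    if (res) {
        ggml_perf_accumulate(perf_totals, res->get_gf());
#if !defined(GGML_PERF) && defined(GGML_PERF_DETAIL)
        // Matches the original #elif: the CSV dump only runs in
        // GGML_PERF_DETAIL-only builds.
        ggml_perf_write_detailed_csv(res->get_gf(), perf_all_shape_fp);
#endif
    }
#endif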
@@ -2751,7 +2759,7 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) {
 }
 
 
-#ifdef GGML_PERF
+#if defined(GGML_PERF)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE("%-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us");
@@ -2779,7 +2787,8 @@ void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
         }
     }
 }
-#elif GGML_PERF_DETAIL
+
+#elif defined(GGML_PERF_DETAIL)
 void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) {
     LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n");
     LLAMA_LOG_TSAVORITE("%-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us");
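`ggml_perf_accumulate` itself is not shown in this diff. For orientation, a hypothetical sketch of what such an accumulator usually looks like, folding per-node counters into the per-op `totals` array that the printers above consume. The node fields (`perf_runs`, `perf_time_us`) follow older `GGML_PERF` builds of ggml, and the struct layout is a placeholder; both are assumptions about this fork, not its actual definitions:

// Hypothetical sketch only: names and fields are assumed, not this fork's.
struct perf_totals_sketch {
    int64_t runs;      // how many times nodes with this op executed
    int64_t total_us;  // accumulated wall time in microseconds
};

static void perf_accumulate_sketch(struct perf_totals_sketch totals[GGML_OP_COUNT],
                                   const struct ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; ++i) {
        const struct ggml_tensor * node = gf->nodes[i];
        totals[node->op].runs     += node->perf_runs;     // assumed per-node counter
        totals[node->op].total_us += node->perf_time_us;  // assumed per-node timer
    }
}

The "Avg us" column in the printers above would then simply be `total_us / runs` for each op.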
@@ -2843,7 +2852,7 @@ void llama_perf_context_print(const llama_context * ctx) {
             __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
     LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 
-#ifdef GGML_PERF
+#if defined(GGML_PERF) || defined(GGML_PERF_DETAIL)
     LLAMA_LOG_TSAVORITE("\n%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
     LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
@@ -2852,7 +2861,7 @@ void llama_perf_context_print(const llama_context * ctx) {
     LLAMA_LOG_TSAVORITE("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
 
     ggml_perf_print_totals(const_cast<ggml_perf_totals *>(ctx->perf_totals));
-#endif /* GGML_PERF */
+#endif /* GGML_PERF || GGML_PERF_DETAIL */
 }
 
 void llama_perf_context_reset(llama_context * ctx) {
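Two usage notes. First, because both sites use `#if defined(GGML_PERF)` / `#elif defined(GGML_PERF_DETAIL)` chains, `GGML_PERF` takes precedence when both macros are defined, so the `/* GGML_PERF || GGML_PERF_DETAIL */` comments describe either-or builds rather than a combined mode. Second, the summary is reached through the public llama.h API; a typical call site, assuming a `llama_context *` set up in the usual way:

// After the decode loop: print the timing report. In GGML_PERF /
// GGML_PERF_DETAIL builds this also emits the per-op summary via
// ggml_perf_print_totals().
llama_perf_context_print(ctx);
llama_perf_context_reset(ctx);  // optional: clear counters between runs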