@@ -1303,16 +1303,19 @@ struct server_context {
13031303 }
13041304
13051305 void send_partial_response (server_slot & slot, completion_token_output tkn) {
1306+ const double t_token_generation = (ggml_time_us () - slot.t_start_generation ) / 1e3 ;
1307+ const double n_gen_second = 1e3 / t_token_generation * slot.n_decoded ;
13061308 server_task_result res;
13071309 res.id = slot.id_task ;
13081310 res.error = false ;
13091311 res.stop = false ;
13101312 res.data = json {
1311- {" content" , tkn.text_to_send },
1312- {" stop" , false },
1313- {" id_slot" , slot.id },
1314- {" multimodal" , false },
1315- {" index" , slot.index },
1313+ {" content" , tkn.text_to_send },
1314+ {" stop" , false },
1315+ {" id_slot" , slot.id },
1316+ {" multimodal" , false },
1317+ {" index" , slot.index },
1318+ {" n_gen_second" , n_gen_second},
13161319 };
13171320
13181321 if (slot.params .sampling .n_probs > 0 ) {
@@ -1340,6 +1343,8 @@ struct server_context {
13401343 }
13411344
13421345 void send_final_response (const server_slot & slot) {
1346+ const double n_prompt_second = 1e3 / slot.t_prompt_processing * slot.n_prompt_tokens_processed ;
1347+ const double n_gen_second = 1e3 / slot.t_token_generation * slot.n_decoded ;
13431348 server_task_result res;
13441349 res.id = slot.id_task ;
13451350 res.error = false ;
@@ -1351,6 +1356,8 @@ struct server_context {
13511356 {" model" , params_base.model_alias },
13521357 {" tokens_predicted" , slot.n_decoded },
13531358 {" tokens_evaluated" , slot.n_prompt_tokens },
1359+ {" n_prompt_second" , n_prompt_second},
1360+ {" n_gen_second" , n_gen_second},
13541361 {" generation_settings" , get_formated_generation (slot)},
13551362 {" prompt" , common_detokenize (ctx, slot.prompt_tokens )},
13561363 {" has_new_line" , slot.has_new_line },
0 commit comments