@@ -1308,11 +1308,12 @@ struct server_slot {
13081308 common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
13091309
13101310 // stats
1311- size_t n_sent_text = 0 ; // number of sent text character
1311+ size_t n_sent_text = 0 ; // number of sent text characters
13121312
13131313 int64_t t_start_process_prompt;
13141314 int64_t t_start_generation;
13151315
1316+ size_t n_prompt_processing = 0 ; // number of decoded prompt tokens (may be less than prompt_tokens.n_kv_tokens(), in case we are using cache)
13161317 double t_prompt_processing; // ms
13171318 double t_token_generation; // ms
13181319
@@ -1334,6 +1335,7 @@ struct server_slot {
13341335 stopping_word = " " ;
13351336 n_past = 0 ;
13361337 n_sent_text = 0 ;
1338+ n_prompt_processing = 0 ;
13371339 task_type = SERVER_TASK_TYPE_COMPLETION;
13381340
13391341 generated_tokens.clear ();
@@ -1402,10 +1404,10 @@ struct server_slot {
14021404
14031405 result_timings get_timings () const {
14041406 result_timings timings;
1405- timings.prompt_n = prompt_tokens. n_kv_tokens () ;
1407+ timings.prompt_n = n_prompt_processing ;
14061408 timings.prompt_ms = t_prompt_processing;
1407- timings.prompt_per_token_ms = t_prompt_processing / prompt_tokens. n_kv_tokens () ;
1408- timings.prompt_per_second = 1e3 / t_prompt_processing * prompt_tokens. n_kv_tokens () ;
1409+ timings.prompt_per_token_ms = t_prompt_processing / n_prompt_processing ;
1410+ timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_processing ;
14091411
14101412 timings.predicted_n = n_decoded;
14111413 timings.predicted_ms = t_token_generation;
@@ -3212,8 +3214,9 @@ struct server_context {
32123214 slot.cache_tokens .push_back (chunk.get ()); // copy
32133215 }
32143216
3215- slot.n_past += n_pos;
3216- slot.n_kv_tokens += n_tok;
3217+ slot.n_past += n_pos;
3218+ slot.n_kv_tokens += n_tok;
3219+ slot.n_prompt_processing += n_tok; // for stats only
32173220 }
32183221
32193222 // add prompt tokens for processing in the current batch
@@ -3233,6 +3236,7 @@ struct server_context {
32333236
32343237 slot.n_kv_tokens ++;
32353238 slot.n_past ++;
3239+ slot.n_prompt_processing ++; // for stats only
32363240 }
32373241
32383242 // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
0 commit comments