@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
     uint64_t n_tokens_predicted = 0;
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total",  n_tokens_predicted_total },
             { "t_prompt_processing_total", t_prompt_processing_total },
 
+            { "n_past_max",                n_past_max },
+
             { "n_prompt_tokens_processed", n_prompt_tokens_processed },
             { "t_prompt_processing",       t_prompt_processing },
             { "n_tokens_predicted",        n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing       += slot.t_prompt_processing;
         t_prompt_processing_total += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }
 
     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }
 
@@ -2875,6 +2888,8 @@ struct server_context {
                     res->n_tokens_predicted_total  = metrics.n_tokens_predicted_total;
                     res->t_tokens_generation_total = metrics.t_tokens_generation_total;
 
+                    res->n_past_max = metrics.n_past_max;
+
                     res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
                     res->t_prompt_processing       = metrics.t_prompt_processing;
                     res->n_tokens_predicted        = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                     {"name",  "n_decode_total"},
                     {"help",  "Total number of llama_decode() calls"},
                     {"value", res_metrics->n_decode_total}
+            }, {
+                    {"name",  "n_past_max"},
+                    {"help",  "Largest observed n_past."},
+                    {"value", res_metrics->n_past_max}
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
0 commit comments