Skip to content

Commit e5155e6

Browse files
authored
server : export max observed n_past value (#15361)
Add tracking of the high-watermark cache usage (the largest observed n_past) and expose it via the /metrics endpoint. Use case: track the largest cache usage needed under a realistic workload to better understand memory requirements, and adjust the cache size/quantization for the model and cache accordingly.
1 parent 21c17b5 commit e5155e6

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

tools/server/server.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
12011201
uint64_t n_tokens_predicted_total = 0;
12021202
uint64_t t_tokens_generation_total = 0;
12031203

1204+
uint64_t n_past_max = 0;
1205+
12041206
uint64_t n_prompt_tokens_processed = 0;
12051207
uint64_t t_prompt_processing = 0;
12061208

@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
12261228
{ "n_tokens_predicted_total", n_tokens_predicted_total },
12271229
{ "t_prompt_processing_total", t_prompt_processing_total },
12281230

1231+
{ "n_past_max", n_past_max },
1232+
12291233
{ "n_prompt_tokens_processed", n_prompt_tokens_processed },
12301234
{ "t_prompt_processing", t_prompt_processing },
12311235
{ "n_tokens_predicted", n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
15871591
uint64_t n_tokens_predicted_total = 0;
15881592
uint64_t t_tokens_generation_total = 0;
15891593

1594+
uint64_t n_past_max = 0;
1595+
15901596
uint64_t n_prompt_tokens_processed = 0;
15911597
uint64_t t_prompt_processing = 0;
15921598

@@ -1605,6 +1611,10 @@ struct server_metrics {
16051611
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
16061612
t_prompt_processing += slot.t_prompt_processing;
16071613
t_prompt_processing_total += slot.t_prompt_processing;
1614+
1615+
if (slot.n_past > 0) {
1616+
n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
1617+
}
16081618
}
16091619

16101620
void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
16201630
if (slot.is_processing()) {
16211631
n_busy_slots_total++;
16221632
}
1633+
if (slot.n_past > 0) {
1634+
n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
1635+
}
16231636
}
16241637
}
16251638

@@ -2875,6 +2888,8 @@ struct server_context {
28752888
res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
28762889
res->t_tokens_generation_total = metrics.t_tokens_generation_total;
28772890

2891+
res->n_past_max = metrics.n_past_max;
2892+
28782893
res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
28792894
res->t_prompt_processing = metrics.t_prompt_processing;
28802895
res->n_tokens_predicted = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
40774092
{"name", "n_decode_total"},
40784093
{"help", "Total number of llama_decode() calls"},
40794094
{"value", res_metrics->n_decode_total}
4095+
}, {
4096+
{"name", "n_past_max"},
4097+
{"help", "Largest observed n_past."},
4098+
{"value", res_metrics->n_past_max}
40804099
}, {
40814100
{"name", "n_busy_slots_per_decode"},
40824101
{"help", "Average number of busy slots per llama_decode() call"},

0 commit comments

Comments
 (0)