Commit c5e5167

ggerganov and ngxson authored
server : add option to debug the slot contents (#16482)
* server : add option to debug the slot contents

* Update tools/server/server.cpp

---------

Co-authored-by: Xuan-Son Nguyen <[email protected]>
1 parent b612f7f commit c5e5167
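
In short: each slot now keeps its most recently finished task in a new task_prev member, so an idle slot can still be inspected, and a LLAMA_SERVER_SLOTS_DEBUG environment variable switches the /slots endpoint from metrics-only output to the full slot contents (task id, parameters, prompt, and generated text). The rest of the diff is a mechanical rename of a local variable in the /slots and /metrics handlers.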

File tree

1 file changed: +41 / -23 lines changed


tools/server/server.cpp

Lines changed: 41 additions & 23 deletions
@@ -1621,6 +1621,7 @@ struct server_slot {
     common_speculative * spec = nullptr;

     std::unique_ptr<const server_task> task;
+    std::unique_ptr<const server_task> task_prev; // used for debugging

     // used to determine the slot that has been used the longest
     int64_t t_last_used = -1;
@@ -1739,6 +1740,7 @@ struct server_slot {
         n_draft_accepted = 0;

         task.reset();
+        task_prev.reset();

         // clear alora start
         alora_invocation_start = -1;
@@ -1813,6 +1815,8 @@ struct server_slot {
             t_last_used = ggml_time_us();
             t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
             state = SLOT_STATE_IDLE;
+
+            task_prev = std::move(task);
             task.reset();

             callback_on_release(id);
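
The release() path now hands the finished task off to task_prev instead of discarding it, so the slot's last request stays inspectable after it goes idle. Below is a minimal standalone sketch of that ownership handoff; toy_task and main() are illustrative stand-ins, not code from the commit:

#include <cstdio>
#include <memory>

// stand-in for server_task (hypothetical, for illustration only)
struct toy_task {
    int id;
};

int main() {
    std::unique_ptr<const toy_task> task(new toy_task{42});
    std::unique_ptr<const toy_task> task_prev;

    // release(): move ownership of the finished task into task_prev
    task_prev = std::move(task);
    task.reset(); // task is already null after the move; the commit keeps the reset() for clarity

    // to_json(): fall back to the previous task when the slot is idle
    const auto & ptask = task ? task : task_prev;
    if (ptask) {
        std::printf("id_task = %d\n", ptask->id);
    }
    return 0;
}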
@@ -1924,11 +1928,13 @@ struct server_slot {
             {"n_ctx", n_ctx},
             {"speculative", can_speculate()},
             {"is_processing", is_processing()},
-            {"id_task", task ? task->id : -1},
         };

-        if (task) {
-            res["params"] = task->params.to_json(only_metrics);
+        const auto & ptask = task ? task : task_prev;
+
+        if (ptask) {
+            res["id_task"] = ptask->id;
+            res["params"] = ptask->params.to_json(only_metrics);
             res["next_token"] = {
                 {
                     {"has_next_token", has_next_token},
@@ -1939,7 +1945,8 @@ struct server_slot {
             };

             if (!only_metrics) {
-                res["prompt"] = task->tokens.detokenize(ctx, true);
+                res["prompt"] = ptask->tokens.detokenize(ctx, true);
+                res["generated"] = generated_text;
             }
         }

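With the ptask fallback, an idle slot no longer reports id_task = -1: /slots can show the id, parameters, prompt, and, via the new "generated" field, the output text of the last completed request. The prompt and generated text remain gated behind only_metrics, which the debug switch below controls.
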
@@ -2335,6 +2342,8 @@ struct server_context {
     // slots / clients
     std::vector<server_slot> slots;

+    int slots_debug = 0;
+
     server_queue queue_tasks;
     server_response queue_results;

@@ -2527,6 +2536,15 @@ struct server_context {
             slots.push_back(std::move(slot));
         }

+        {
+            const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG");
+            slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0;
+
+            if (slots_debug) {
+                SRV_WRN("slots debug = %d\n", slots_debug);
+            }
+        }
+
         // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
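
The switch itself is a plain getenv()/atoi() gate: an unset or non-numeric variable yields 0 (disabled) and any non-zero integer enables debugging. Here is a self-contained sketch of the same pattern, runnable outside the server (the main() wrapper is illustrative, not part of the commit):

#include <cstdio>
#include <cstdlib>

int main() {
    // atoi() returns 0 for non-numeric input, so a garbage value
    // leaves debugging disabled instead of failing
    const char * env = std::getenv("LLAMA_SERVER_SLOTS_DEBUG");
    const int slots_debug = env ? std::atoi(env) : 0;

    if (slots_debug) {
        std::fprintf(stderr, "slots debug = %d\n", slots_debug);
    }
    return 0;
}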
@@ -3331,7 +3349,7 @@ struct server_context {
                 int n_processing_slots = 0;

                 for (server_slot & slot : slots) {
-                    json slot_data = slot.to_json(true);
+                    json slot_data = slot.to_json(slots_debug == 0);

                     if (slot.is_processing()) {
                         n_processing_slots++;
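
Passing slots_debug == 0 preserves the default behavior: with the variable unset, to_json(true) returns metrics only, exactly as before; with it set, only_metrics becomes false and the /slots response includes the full slot contents. The remaining hunks rename the local res_metrics to res_task in the /slots and /metrics handlers with no behavioral change.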
@@ -4578,18 +4596,18 @@ int main(int argc, char ** argv) {
         }

         // TODO: get rid of this dynamic_cast
-        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
-        GGML_ASSERT(res_metrics != nullptr);
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);

         // optionally return "fail_on_no_slot" error
         if (req.has_param("fail_on_no_slot")) {
-            if (res_metrics->n_idle_slots == 0) {
+            if (res_task->n_idle_slots == 0) {
                 res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE));
                 return;
             }
         }

-        res_ok(res, res_metrics->slots_data);
+        res_ok(res, res_task->slots_data);
     };

     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
@@ -4617,56 +4635,56 @@ int main(int argc, char ** argv) {
         }

         // TODO: get rid of this dynamic_cast
-        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
-        GGML_ASSERT(res_metrics != nullptr);
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);

         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
         json all_metrics_def = json {
             {"counter", {{
                 {"name", "prompt_tokens_total"},
                 {"help", "Number of prompt tokens processed."},
-                {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total}
+                {"value", (uint64_t) res_task->n_prompt_tokens_processed_total}
             }, {
                 {"name", "prompt_seconds_total"},
                 {"help", "Prompt process time"},
-                {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3}
+                {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3}
             }, {
                 {"name", "tokens_predicted_total"},
                 {"help", "Number of generation tokens processed."},
-                {"value", (uint64_t) res_metrics->n_tokens_predicted_total}
+                {"value", (uint64_t) res_task->n_tokens_predicted_total}
             }, {
                 {"name", "tokens_predicted_seconds_total"},
                 {"help", "Predict process time"},
-                {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3}
+                {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3}
             }, {
                 {"name", "n_decode_total"},
                 {"help", "Total number of llama_decode() calls"},
-                {"value", res_metrics->n_decode_total}
+                {"value", res_task->n_decode_total}
             }, {
                 {"name", "n_past_max"},
                 {"help", "Largest observed n_past."},
-                {"value", res_metrics->n_past_max}
+                {"value", res_task->n_past_max}
             }, {
                 {"name", "n_busy_slots_per_decode"},
                 {"help", "Average number of busy slots per llama_decode() call"},
-                {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
+                {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                 {"name", "prompt_tokens_seconds"},
                 {"help", "Average prompt throughput in tokens/s."},
-                {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.}
+                {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.}
             },{
                 {"name", "predicted_tokens_seconds"},
                 {"help", "Average generation throughput in tokens/s."},
-                {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
+                {"value", res_task->n_tokens_predicted ? 1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.}
             },{
                 {"name", "requests_processing"},
                 {"help", "Number of requests processing."},
-                {"value", (uint64_t) res_metrics->n_processing_slots}
+                {"value", (uint64_t) res_task->n_processing_slots}
             },{
                 {"name", "requests_deferred"},
                 {"help", "Number of requests deferred."},
-                {"value", (uint64_t) res_metrics->n_tasks_deferred}
+                {"value", (uint64_t) res_task->n_tasks_deferred}
             }}}
         };

@@ -4687,7 +4705,7 @@ int main(int argc, char ** argv) {
             }
         }

-        res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start));
+        res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start));

         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
         res.status = 200; // HTTP OK
