@@ -1621,6 +1621,7 @@ struct server_slot {
     common_speculative * spec = nullptr;

     std::unique_ptr<const server_task> task;
+    std::unique_ptr<const server_task> task_prev; // used for debugging

     // used to determine the slot that has been used the longest
     int64_t t_last_used = -1;
@@ -1739,6 +1740,7 @@ struct server_slot {
         n_draft_accepted = 0;

         task.reset();
+        task_prev.reset();

         // clear alora start
         alora_invocation_start = -1;
@@ -1813,6 +1815,8 @@ struct server_slot {
         t_last_used = ggml_time_us();
         t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
         state = SLOT_STATE_IDLE;
+
+        task_prev = std::move(task);
         task.reset();

         callback_on_release(id);
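Note: `std::unique_ptr`'s move assignment is guaranteed to leave the source pointer null, so the `task.reset()` kept after the new `task_prev = std::move(task)` is a harmless no-op. A minimal standalone sketch of that guarantee (illustrative, not part of the patch):

```cpp
#include <cassert>
#include <memory>

int main() {
    auto task      = std::make_unique<int>(42);
    auto task_prev = std::move(task); // move leaves `task` null...
    assert(task == nullptr);          // ...so a following task.reset() is a no-op
    assert(task_prev && *task_prev == 42);
    return 0;
}
```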
@@ -1924,11 +1928,13 @@ struct server_slot {
19241928 {" n_ctx" , n_ctx},
19251929 {" speculative" , can_speculate ()},
19261930 {" is_processing" , is_processing ()},
1927- {" id_task" , task ? task->id : -1 },
19281931 };
19291932
1930- if (task) {
1931- res[" params" ] = task->params .to_json (only_metrics);
1933+ const auto & ptask = task ? task : task_prev;
1934+
1935+ if (ptask) {
1936+ res[" id_task" ] = ptask->id ;
1937+ res[" params" ] = ptask->params .to_json (only_metrics);
19321938 res[" next_token" ] = {
19331939 {
19341940 {" has_next_token" , has_next_token},
@@ -1939,7 +1945,8 @@ struct server_slot {
             };

             if (!only_metrics) {
-                res["prompt"] = task->tokens.detokenize(ctx, true);
+                res["prompt"] = ptask->tokens.detokenize(ctx, true);
+                res["generated"] = generated_text;
             }
         }

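Note: with the `ptask` fallback, an idle slot keeps reporting the id, params, prompt, and (newly) generated text of the request it last served, instead of falling back to `id_task = -1`. An illustrative sketch of the resulting per-slot entry, built with nlohmann::json as the server does (field values are made up):

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // shape of one /slots entry for an idle slot whose task_prev is still
    // set, assuming debug output is enabled (only_metrics == false)
    json slot_entry = {
        {"is_processing", false},
        {"id_task",       17},    // now taken from task_prev once the slot is idle
        {"prompt",        "..."}, // detokenized input of the previous request
        {"generated",     "..."}, // new field: text generated for that request
    };
    std::cout << slot_entry.dump(2) << std::endl;
    return 0;
}
```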
@@ -2335,6 +2342,8 @@ struct server_context {
     // slots / clients
     std::vector<server_slot> slots;

+    int slots_debug = 0;
+
     server_queue    queue_tasks;
     server_response queue_results;

@@ -2527,6 +2536,15 @@ struct server_context {
             slots.push_back(std::move(slot));
         }

+        {
+            const char * LLAMA_SERVER_SLOTS_DEBUG = getenv("LLAMA_SERVER_SLOTS_DEBUG");
+            slots_debug = LLAMA_SERVER_SLOTS_DEBUG ? atoi(LLAMA_SERVER_SLOTS_DEBUG) : 0;
+
+            if (slots_debug) {
+                SRV_WRN("slots debug = %d\n", slots_debug);
+            }
+        }
+
         // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
         // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
         {
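Note: the gate uses `atoi()`, which yields 0 for non-numeric input, so only a value that parses to a non-zero integer (e.g. `LLAMA_SERVER_SLOTS_DEBUG=1`) enables the verbose per-slot output produced by `slot.to_json(slots_debug == 0)` below. A standalone sketch of the parsing behavior (only the C standard library is assumed):

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // same pattern as the patch: unset -> 0, non-numeric ("yes") -> 0, "1" -> 1
    const char * v = std::getenv("LLAMA_SERVER_SLOTS_DEBUG");
    const int slots_debug = v ? std::atoi(v) : 0;
    std::printf("slots debug = %d\n", slots_debug);
    return 0;
}
```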
@@ -3331,7 +3349,7 @@ struct server_context {
         int n_processing_slots = 0;

         for (server_slot & slot : slots) {
-            json slot_data = slot.to_json(true);
+            json slot_data = slot.to_json(slots_debug == 0);

             if (slot.is_processing()) {
                 n_processing_slots++;
@@ -4578,18 +4596,18 @@ int main(int argc, char ** argv) {
         }

         // TODO: get rid of this dynamic_cast
-        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
-        GGML_ASSERT(res_metrics != nullptr);
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);

         // optionally return "fail_on_no_slot" error
         if (req.has_param("fail_on_no_slot")) {
-            if (res_metrics->n_idle_slots == 0) {
+            if (res_task->n_idle_slots == 0) {
                 res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE));
                 return;
             }
         }

-        res_ok(res, res_metrics->slots_data);
+        res_ok(res, res_task->slots_data);
     };

     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
@@ -4617,56 +4635,56 @@ int main(int argc, char ** argv) {
         }

         // TODO: get rid of this dynamic_cast
-        auto res_metrics = dynamic_cast<server_task_result_metrics*>(result.get());
-        GGML_ASSERT(res_metrics != nullptr);
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);

         // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
         json all_metrics_def = json {
             {"counter", {{
                     {"name",  "prompt_tokens_total"},
                     {"help",  "Number of prompt tokens processed."},
-                    {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total}
+                    {"value", (uint64_t) res_task->n_prompt_tokens_processed_total}
             }, {
                     {"name",  "prompt_seconds_total"},
                     {"help",  "Prompt process time"},
-                    {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3}
+                    {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3}
             }, {
                     {"name",  "tokens_predicted_total"},
                     {"help",  "Number of generation tokens processed."},
-                    {"value", (uint64_t) res_metrics->n_tokens_predicted_total}
+                    {"value", (uint64_t) res_task->n_tokens_predicted_total}
             }, {
                     {"name",  "tokens_predicted_seconds_total"},
                     {"help",  "Predict process time"},
-                    {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3}
+                    {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3}
             }, {
                     {"name",  "n_decode_total"},
                     {"help",  "Total number of llama_decode() calls"},
-                    {"value", res_metrics->n_decode_total}
+                    {"value", res_task->n_decode_total}
             }, {
                     {"name",  "n_past_max"},
                     {"help",  "Largest observed n_past."},
-                    {"value", res_metrics->n_past_max}
+                    {"value", res_task->n_past_max}
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
-                    {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
+                    {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
                     {"help",  "Average prompt throughput in tokens/s."},
-                    {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.}
+                    {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.}
             },{
                     {"name",  "predicted_tokens_seconds"},
                     {"help",  "Average generation throughput in tokens/s."},
-                    {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
+                    {"value", res_task->n_tokens_predicted ? 1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.}
             },{
                     {"name",  "requests_processing"},
                     {"help",  "Number of requests processing."},
-                    {"value", (uint64_t) res_metrics->n_processing_slots}
+                    {"value", (uint64_t) res_task->n_processing_slots}
             },{
                     {"name",  "requests_deferred"},
                     {"help",  "Number of requests deferred."},
-                    {"value", (uint64_t) res_metrics->n_tasks_deferred}
+                    {"value", (uint64_t) res_task->n_tasks_deferred}
             }}}
         };

@@ -4687,7 +4705,7 @@ int main(int argc, char ** argv) {
             }
         }

-        res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start));
+        res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start));

         res.set_content(prometheus.str(), "text/plain; version=0.0.4");
         res.status = 200; // HTTP OK