From 2dc29181e4fb1cd333526d834fc944991d6dfc42 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Wed, 26 Mar 2025 23:04:19 -0700 Subject: [PATCH 1/3] Include speculative decoding stats when timings_per_token is true New fields added to the `timings` object: - draft_n : number of draft tokens generated - draft_accepted_n : number of draft tokens accepted - draft_accept_ratio: ratio of accepted/generated --- examples/server/server.cpp | 40 +++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 18caa9127662d..469f8ec48032b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -489,8 +489,13 @@ struct result_timings { double predicted_per_token_ms; double predicted_per_second; + // Optional speculative metrics - only included when > 0 + int32_t draft_n = 0; + int32_t draft_accepted_n = 0; + double draft_accept_ratio = 0; + json to_json() const { - return { + json base = { {"prompt_n", prompt_n}, {"prompt_ms", prompt_ms}, {"prompt_per_token_ms", prompt_per_token_ms}, @@ -501,6 +506,14 @@ struct result_timings { {"predicted_per_token_ms", predicted_per_token_ms}, {"predicted_per_second", predicted_per_second}, }; + + if (draft_n > 0) { + base["draft_n"] = draft_n; + base["draft_accepted_n"] = draft_accepted_n; + base["draft_accept_ratio"] = draft_accept_ratio; + } + + return base; } }; @@ -1299,6 +1312,11 @@ struct server_slot { std::function callback_on_release; + // Speculative decoding stats + int32_t n_draft_total = 0; // Total draft tokens generated + int32_t n_draft_accepted = 0; // Draft tokens actually accepted + float draft_accept_ratio = 0.f; // n_draft_accepted/n_draft_total + void reset() { SLT_DBG(*this, "%s", "\n"); @@ -1315,6 +1333,11 @@ struct server_slot { generated_tokens.clear(); generated_token_probs.clear(); + + // clear speculative decoding stats + n_draft_total = 0; + n_draft_accepted = 0; + draft_accept_ratio = 0.f; } bool 
is_non_causal() const { @@ -1381,6 +1404,13 @@ struct server_slot { timings.predicted_per_token_ms = t_token_generation / n_decoded; timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; + // Add speculative metrics + if (n_draft_total > 0) { + timings.draft_n = n_draft_total; + timings.draft_accepted_n = n_draft_accepted; + timings.draft_accept_ratio = draft_accept_ratio; + } + return timings; } @@ -3290,6 +3320,8 @@ struct server_context { llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); + slot.n_draft_total += draft.size(); + // ignore small drafts if (slot.params.speculative.n_min > (int) draft.size()) { SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min); @@ -3339,6 +3371,12 @@ struct server_context { } } + // Update speculative metrics + slot.n_draft_accepted += ids.size() - 1; // exclude last sampled token + if (slot.n_draft_total > 0) { + slot.draft_accept_ratio = (float)slot.n_draft_accepted / slot.n_draft_total; + } + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); } } From 41a8e85d0f598922830d42b8c3b976a13d2650a8 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Thu, 27 Mar 2025 13:18:52 -0700 Subject: [PATCH 2/3] Remove redundant draft_accept_ratio var --- examples/server/server.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 469f8ec48032b..f61b14a1f7805 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -491,8 +491,7 @@ struct result_timings { // Optional speculative metrics - only included when > 0 int32_t draft_n = 0; - int32_t draft_accepted_n = 0; - double draft_accept_ratio = 0; + int32_t draft_n_accepted = 0; json to_json() const { json base = { @@ -509,8 +508,7 @@ struct result_timings { if (draft_n > 0) { base["draft_n"] = draft_n; - 
base["draft_accepted_n"] = draft_accepted_n; - base["draft_accept_ratio"] = draft_accept_ratio; + base["draft_n_accepted"] = draft_n_accepted; } return base; @@ -1315,7 +1313,6 @@ struct server_slot { // Speculative decoding stats int32_t n_draft_total = 0; // Total draft tokens generated int32_t n_draft_accepted = 0; // Draft tokens actually accepted - float draft_accept_ratio = 0.f; // n_draft_accepted/n_draft_total void reset() { SLT_DBG(*this, "%s", "\n"); @@ -1337,7 +1334,6 @@ struct server_slot { // clear speculative decoding stats n_draft_total = 0; n_draft_accepted = 0; - draft_accept_ratio = 0.f; } bool is_non_causal() const { @@ -1407,8 +1403,7 @@ struct server_slot { // Add speculative metrics if (n_draft_total > 0) { timings.draft_n = n_draft_total; - timings.draft_accepted_n = n_draft_accepted; - timings.draft_accept_ratio = draft_accept_ratio; + timings.draft_n_accepted = n_draft_accepted; } return timings; @@ -3320,6 +3315,7 @@ struct server_context { llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); + // keep track of total number of tokens generated in the draft slot.n_draft_total += draft.size(); // ignore small drafts @@ -3347,6 +3343,9 @@ struct server_context { slot.n_past += ids.size(); slot.n_decoded += ids.size(); + // update how many of the drafted tokens were accepted + slot.n_draft_accepted += ids.size() - 1; + slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); @@ -3371,12 +3370,6 @@ struct server_context { } } - // Update speculative metrics - slot.n_draft_accepted += ids.size() - 1; // exclude last sampled token - if (slot.n_draft_total > 0) { - slot.draft_accept_ratio = (float)slot.n_draft_accepted / slot.n_draft_total; - } - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); } } From 429820ec8f4d0e58f5a5cfef47c7d35357f043e3 Mon Sep 17 00:00:00 2001 From: Benson 
Wong Date: Thu, 27 Mar 2025 13:57:18 -0700 Subject: [PATCH 3/3] add draft acceptance rate to server console output --- examples/server/server.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f61b14a1f7805..12aa2a775f997 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1453,6 +1453,15 @@ struct server_slot { t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); + + if (n_draft_total > 0) { + const float draft_ratio = (float) n_draft_accepted / n_draft_total; + SLT_INF(*this, + "\n" + "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", + draft_ratio, n_draft_accepted, n_draft_total + ); + } } json to_json() const {