Skip to content

Commit 41a8e85

Browse files
committed
Remove redundant draft_accept_ratio var
1 parent 2dc2918 commit 41a8e85

File tree

1 file changed

+7
-14
lines changed

1 file changed

+7
-14
lines changed

examples/server/server.cpp

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -491,8 +491,7 @@ struct result_timings {
491491

492492
// Optional speculative metrics - only included when > 0
493493
int32_t draft_n = 0;
494-
int32_t draft_accepted_n = 0;
495-
double draft_accept_ratio = 0;
494+
int32_t draft_n_accepted = 0;
496495

497496
json to_json() const {
498497
json base = {
@@ -509,8 +508,7 @@ struct result_timings {
509508

510509
if (draft_n > 0) {
511510
base["draft_n"] = draft_n;
512-
base["draft_accepted_n"] = draft_accepted_n;
513-
base["draft_accept_ratio"] = draft_accept_ratio;
511+
base["draft_n_accepted"] = draft_n_accepted;
514512
}
515513

516514
return base;
@@ -1315,7 +1313,6 @@ struct server_slot {
13151313
// Speculative decoding stats
13161314
int32_t n_draft_total = 0; // Total draft tokens generated
13171315
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
1318-
float draft_accept_ratio = 0.f; // n_draft_accepted/n_draft_total
13191316

13201317
void reset() {
13211318
SLT_DBG(*this, "%s", "\n");
@@ -1337,7 +1334,6 @@ struct server_slot {
13371334
// clear speculative decoding stats
13381335
n_draft_total = 0;
13391336
n_draft_accepted = 0;
1340-
draft_accept_ratio = 0.f;
13411337
}
13421338

13431339
bool is_non_causal() const {
@@ -1407,8 +1403,7 @@ struct server_slot {
14071403
// Add speculative metrics
14081404
if (n_draft_total > 0) {
14091405
timings.draft_n = n_draft_total;
1410-
timings.draft_accepted_n = n_draft_accepted;
1411-
timings.draft_accept_ratio = draft_accept_ratio;
1406+
timings.draft_n_accepted = n_draft_accepted;
14121407
}
14131408

14141409
return timings;
@@ -3320,6 +3315,7 @@ struct server_context {
33203315

33213316
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
33223317

3318+
// keep track of total number of tokens generated in the draft
33233319
slot.n_draft_total += draft.size();
33243320

33253321
// ignore small drafts
@@ -3347,6 +3343,9 @@ struct server_context {
33473343
slot.n_past += ids.size();
33483344
slot.n_decoded += ids.size();
33493345

3346+
// update how many tokens out of the draft were accepted
3347+
slot.n_draft_accepted += ids.size() - 1;
3348+
33503349
slot.cache_tokens.push_back(id);
33513350
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
33523351

@@ -3371,12 +3370,6 @@ struct server_context {
33713370
}
33723371
}
33733372

3374-
// Update speculative metrics
3375-
slot.n_draft_accepted += ids.size() - 1; // exclude last sampled token
3376-
if (slot.n_draft_total > 0) {
3377-
slot.draft_accept_ratio = (float)slot.n_draft_accepted / slot.n_draft_total;
3378-
}
3379-
33803373
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
33813374
}
33823375
}

0 commit comments

Comments
 (0)