@@ -491,8 +491,7 @@ struct result_timings {
491491
492492 // Optional speculative metrics - only included when > 0
493493 int32_t draft_n = 0 ;
494- int32_t draft_accepted_n = 0 ;
495- double draft_accept_ratio = 0 ;
494+ int32_t draft_n_accepted = 0 ;
496495
497496 json to_json () const {
498497 json base = {
@@ -509,8 +508,7 @@ struct result_timings {
509508
510509 if (draft_n > 0 ) {
511510 base[" draft_n" ] = draft_n;
512- base[" draft_accepted_n" ] = draft_accepted_n;
513- base[" draft_accept_ratio" ] = draft_accept_ratio;
511+ base[" draft_n_accepted" ] = draft_n_accepted;
514512 }
515513
516514 return base;
@@ -1315,7 +1313,6 @@ struct server_slot {
13151313 // Speculative decoding stats
13161314 int32_t n_draft_total = 0 ; // Total draft tokens generated
13171315 int32_t n_draft_accepted = 0 ; // Draft tokens actually accepted
1318- float draft_accept_ratio = 0 .f; // n_draft_accepted/n_draft_total
13191316
13201317 void reset () {
13211318 SLT_DBG (*this , " %s" , " \n " );
@@ -1337,7 +1334,6 @@ struct server_slot {
13371334 // clear speculative decoding stats
13381335 n_draft_total = 0 ;
13391336 n_draft_accepted = 0 ;
1340- draft_accept_ratio = 0 .f ;
13411337 }
13421338
13431339 bool is_non_causal () const {
@@ -1407,8 +1403,7 @@ struct server_slot {
14071403 // Add speculative metrics
14081404 if (n_draft_total > 0 ) {
14091405 timings.draft_n = n_draft_total;
1410- timings.draft_accepted_n = n_draft_accepted;
1411- timings.draft_accept_ratio = draft_accept_ratio;
1406+ timings.draft_n_accepted = n_draft_accepted;
14121407 }
14131408
14141409 return timings;
@@ -3320,6 +3315,7 @@ struct server_context {
33203315
33213316 llama_tokens draft = common_speculative_gen_draft (slot.spec , params_spec, slot.cache_tokens , id);
33223317
3318+ // keep track of total number of tokens generated in the draft
33233319 slot.n_draft_total += draft.size ();
33243320
33253321 // ignore small drafts
@@ -3347,6 +3343,9 @@ struct server_context {
33473343 slot.n_past += ids.size ();
33483344 slot.n_decoded += ids.size ();
33493345
3346+ // update how many tokens out of the draft were accepted
3347+ slot.n_draft_accepted += ids.size () - 1 ;
3348+
33503349 slot.cache_tokens .push_back (id);
33513350 slot.cache_tokens .insert (slot.cache_tokens .end (), ids.begin (), ids.end () - 1 );
33523351
@@ -3371,12 +3370,6 @@ struct server_context {
33713370 }
33723371 }
33733372
3374- // Update speculative metrics
3375- slot.n_draft_accepted += ids.size () - 1 ; // exclude last sampled token
3376- if (slot.n_draft_total > 0 ) {
3377- slot.draft_accept_ratio = (float )slot.n_draft_accepted / slot.n_draft_total ;
3378- }
3379-
33803373 SLT_DBG (slot, " accepted %d/%d draft tokens, new n_past = %d\n " , (int ) ids.size () - 1 , (int ) draft.size (), slot.n_past );
33813374 }
33823375 }
0 commit comments