Remove redundant draft_accept_ratio var

mostlygeek · mostlygeek · commit 41a8e85d0f59 · 2025-03-27T13:18:52.000-07:00
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -491,8 +491,7 @@ struct result_timings {
 
     // Optional speculative metrics - only included when > 0
     int32_t draft_n = 0;
-    int32_t draft_accepted_n = 0;
-    double draft_accept_ratio = 0;
+    int32_t draft_n_accepted = 0;
 
     json to_json() const {
         json base = {
@@ -509,8 +508,7 @@ struct result_timings {
 
         if (draft_n > 0) {
             base["draft_n"] = draft_n;
-            base["draft_accepted_n"] = draft_accepted_n;
-            base["draft_accept_ratio"] = draft_accept_ratio;
+            base["draft_n_accepted"] = draft_n_accepted;
         }
 
         return base;
@@ -1315,7 +1313,6 @@ struct server_slot {
     // Speculative decoding stats
     int32_t n_draft_total = 0;      // Total draft tokens generated
     int32_t n_draft_accepted = 0;   // Draft tokens actually accepted
-    float draft_accept_ratio = 0.f; // n_draft_accepted/n_draft_total
 
     void reset() {
         SLT_DBG(*this, "%s", "\n");
@@ -1337,7 +1334,6 @@ struct server_slot {
         // clear speculative decoding stats
         n_draft_total = 0;
         n_draft_accepted = 0;
-        draft_accept_ratio = 0.f;
     }
 
     bool is_non_causal() const {
@@ -1407,8 +1403,7 @@ struct server_slot {
         // Add speculative metrics
         if (n_draft_total > 0) {
             timings.draft_n = n_draft_total;
-            timings.draft_accepted_n = n_draft_accepted;
-            timings.draft_accept_ratio = draft_accept_ratio;
+            timings.draft_n_accepted = n_draft_accepted;
         }
 
         return timings;
@@ -3320,6 +3315,7 @@ struct server_context {
 
                 llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
 
+                // keep track of total number of tokens generated in the draft
                 slot.n_draft_total += draft.size();
 
                 // ignore small drafts
@@ -3347,6 +3343,9 @@ struct server_context {
                 slot.n_past    += ids.size();
                 slot.n_decoded += ids.size();
 
+                // update how many tokens out of draft was accepted
+                slot.n_draft_accepted += ids.size() - 1;
+
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
@@ -3371,12 +3370,6 @@ struct server_context {
                     }
                 }
 
-                // Update speculative metrics
-                slot.n_draft_accepted += ids.size() - 1; // exclude last sampled token
-                if (slot.n_draft_total > 0) {
-                    slot.draft_accept_ratio = (float)slot.n_draft_accepted / slot.n_draft_total;
-                }
-
                 SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
             }
         }