Apply suggestions from code review

ngxson · ggerganov · web-flow · commit 987955f06b83 · 2025-05-19T15:32:25.000+02:00
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
@@ -1308,7 +1308,7 @@ struct server_slot {
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
 
-    size_t n_prompt_processing = 0; // number of decoded prompt tokens (may be less than prompt_tokens.n_kv_tokens(), in case we are using cache)
+    size_t n_prompt_processing = 0; // number of decoded prompt tokens (may be less than prompt_tokens.n_tokens(), in case we are using cache)
     double t_prompt_processing; // ms
     double t_token_generation;  // ms
 
@@ -2476,7 +2476,7 @@ struct server_context {
         res->truncated           = slot.truncated;
         res->n_decoded           = slot.n_decoded;
         res->n_prompt_tokens     = slot.n_prompt_tokens();
-        res->n_tokens_cached     = slot.n_past;
+        res->n_tokens_cached     = slot.n_cache_tokens();
         res->has_new_line        = slot.has_new_line;
         res->stopping_word       = slot.stopping_word;
         res->stop                = slot.stop;
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
@@ -1045,7 +1045,6 @@ struct server_tokens {
     // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
     // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
     // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
-    // in otherwords, tokens.size() == n_past
     llama_tokens tokens;
 
     // for ex. with input of 5 text tokens and 2 images: