Skip to content

Commit 382135a

Browse files
committed
Fix MTP KV-cache update sequencing: perform the update after sampling/acceptance, once following prompt processing rather than before each sampled token
1 parent 6870f97 commit 382135a

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

tools/server/server.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3543,18 +3543,19 @@ struct server_context {
35433543

35443544
const int tok_idx = slot.i_batch - i;
35453545

3546-
// This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
3547-
if (slot.has_mtp) {
3548-
mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
3549-
}
3550-
35513546
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
35523547
slot.last_tok_idx = tok_idx;
3548+
SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
35533549

35543550
slot.i_batch = -1;
35553551

35563552
common_sampler_accept(slot.smpl, id, true);
35573553

3554+
// This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
3555+
if (slot.has_mtp) {
3556+
mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
3557+
}
3558+
35583559
slot.n_decoded += 1;
35593560

35603561
const int64_t t_current = ggml_time_us();

0 commit comments

Comments
 (0)