@@ -3541,7 +3541,11 @@ struct server_context {
             slot.n_past = 0;
         }

-        const auto n_swa = llama_model_n_swa(model);
+        // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1
+        const auto n_swa = std::max(1, llama_model_n_swa(model));
+
+        // the largest pos_min required for a checkpoint to be useful
+        const auto pos_min_thold = std::max(0, slot.n_past - n_swa);

         if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
             const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
@@ -3550,17 +3554,16 @@ struct server_context {
                 GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
             }

-            const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
-
-            if (pos_min > pos_min_thold + 1) {
+            if (pos_min > pos_min_thold) {
                 SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);

                 // search for a context checkpoint
                 const auto it = std::find_if(
                     slot.ctx_checkpoints.rbegin(),
                     slot.ctx_checkpoints.rend(),
                     [&](const auto & cur) {
-                        return cur.pos_min <= pos_min_thold;
+                        // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
+                        return cur.pos_min < pos_min_thold;
                     }
                 );

@@ -3577,7 +3580,7 @@ struct server_context {
                         do_reset = true;
                         // printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
                     } else {
-                        slot.n_past = std::min(slot.n_past, it->pos_max);
+                        slot.n_past = std::min(slot.n_past, std::max(it->pos_min + 1, it->pos_max));
                         SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
                     }
                 }
@@ -3586,25 +3589,23 @@ struct server_context {
                     SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
                             "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
                     slot.n_past = 0;
-                    slot.ctx_checkpoints.clear();
                 }
             }
         }

-        if (n_swa > 0) {
-            const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
-
+        {
             // erase any checkpoints with pos_min > pos_min_thold
             for (int i = (int) slot.ctx_checkpoints.size() - 1; i >= 0; i--) {
                 const auto & cur = slot.ctx_checkpoints[i];
                 if (cur.pos_min > pos_min_thold) {
-                    slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin() + i);
                     SLT_WRN(slot, "erased invalidated context checkpoint for SWA (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024);
+                    slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin() + i);
                 }
             }
         }
     }

+    // [TAG_PROMPT_LOGITS]
     if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
         SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, n_prompt_tokens = %d)\n", slot.n_past, slot.n_prompt_tokens);
         slot.n_past--;
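
For reference, a minimal standalone sketch of the restore logic this diff converges on. The `Checkpoint` struct and the `restore_from_checkpoints` helper are hypothetical names for illustration, not part of the server code; only the threshold math mirrors the diff: `pos_min_thold = max(0, n_past - n_swa)`, the strict `<` comparison, and the `pos_min + 1` clamp.

    // standalone sketch with hypothetical names (Checkpoint, restore_from_checkpoints)
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Checkpoint {
        int pos_min; // first position covered by the checkpoint
        int pos_max; // last position covered by the checkpoint
    };

    // returns the new n_past after checkpoint selection, or 0 when a full
    // prompt re-processing is required
    int restore_from_checkpoints(const std::vector<Checkpoint> & checkpoints, int n_past, int n_swa_model) {
        // n_swa == 0 means the model does not use SWA, equivalent to a window of 1
        const int n_swa = std::max(1, n_swa_model);

        // the largest pos_min required for a checkpoint to be useful
        const int pos_min_thold = std::max(0, n_past - n_swa);

        // search newest-first; the strict '<' guarantees that restoring leaves
        // at least one token to be processed [TAG_PROMPT_LOGITS]
        const auto it = std::find_if(checkpoints.rbegin(), checkpoints.rend(),
            [&](const Checkpoint & cur) { return cur.pos_min < pos_min_thold; });

        if (it == checkpoints.rend()) {
            return 0; // no usable checkpoint -> full re-processing
        }

        // clamp n_past, but never below pos_min + 1, so the slot always has a token to evaluate
        return std::min(n_past, std::max(it->pos_min + 1, it->pos_max));
    }

    int main() {
        const std::vector<Checkpoint> cps = { {0, 100}, {200, 300} };
        // n_past = 512, n_swa = 256 -> pos_min_thold = 256 -> restores from {200, 300}
        printf("n_past = %d\n", restore_from_checkpoints(cps, 512, 256)); // prints 300
    }

The strict `<` (rather than the old `<=`) and the `pos_min + 1` clamp serve the same invariant tagged [TAG_PROMPT_LOGITS]: after restoring, n_past stays strictly below the prompt length, so the slot always evaluates at least one token and produces logits.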