
Commit 6fc5bcd

server : cleanup and fixes
1 parent 85d5053

5 files changed: +22, -19 lines

include/llama.h (5 additions, 1 deletion)

@@ -794,7 +794,11 @@ extern "C" {
     size_t n_token_capacity,
     size_t * n_token_count_out);

-#define LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY 1
+// for backwards-compat
+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
+#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1

 typedef uint32_t llama_state_seq_flags;
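
For reference, a minimal sketch of how a caller might save and restore only the partial state of a sequence using the renamed flag. This is an illustrative example, not code from this commit; it assumes an initialized llama_context `ctx` and a valid `seq_id`:

    // illustrative helpers, not part of this commit
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    // serialize only the partial state (e.g. SWA KV cache or recurrent cache) of a sequence
    static std::vector<uint8_t> save_partial_state(llama_context * ctx, llama_seq_id seq_id) {
        const size_t size = llama_state_seq_get_size_ext(ctx, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        std::vector<uint8_t> buf(size);
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        return buf;
    }

    // restore a previously saved partial state; returns false if the size check fails
    static bool restore_partial_state(llama_context * ctx, llama_seq_id seq_id, const std::vector<uint8_t> & buf) {
        const size_t n = llama_state_seq_set_data_ext(ctx, buf.data(), buf.size(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        return n == buf.size();
    }
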

src/llama-kv-cache-iswa.cpp (2 additions, 2 deletions)

@@ -220,15 +220,15 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }

 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }

     kv_swa->state_write(io, seq_id, flags);
 }

 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }

src/llama-memory-hybrid.cpp (2 additions, 2 deletions)

@@ -175,14 +175,14 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdo
 }

 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         mem_attn->state_write(io, seq_id, flags);
     }
     mem_recr->state_write(io, seq_id, flags);
 }

 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         mem_attn->state_read(io, seq_id, flags);
     }
     mem_recr->state_read(io, seq_id, flags);

src/llama-memory-recurrent.cpp (0 additions, 4 deletions)

@@ -692,8 +692,6 @@ size_t llama_memory_recurrent::size_s_bytes() const {
 }

 void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    // the LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY flag is acknowledged but does not change
-    // behavior here, as there is no notion of a partial state for a recurrent context
     GGML_UNUSED(flags);

     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
@@ -734,8 +732,6 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
 }

 void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    // the LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY flag is acknowledged but does not change
-    // behavior here, as there is no notion of a partial state for a recurrent context
     GGML_UNUSED(flags);

     uint32_t cell_count;

tools/server/server.cpp (13 additions, 10 deletions)

@@ -3573,7 +3573,7 @@ struct server_context {
 if (!do_reset) {
     // restore the context checkpoint
     const size_t ctx_checkpoint_size = it->data.size();
-    const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), ctx_checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY);
+    const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), ctx_checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

     if (n != ctx_checkpoint_size) {
         SLT_ERR(slot, "failed to restore context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) ctx_checkpoint_size / 1024 / 1024);
@@ -3598,7 +3598,7 @@ struct server_context {
 for (int i = (int) slot.ctx_checkpoints.size() - 1; i >= 0; i--) {
     const auto & cur = slot.ctx_checkpoints[i];
     if (cur.pos_min > pos_min_thold) {
-        SLT_WRN(slot, "erased invalidated context checkpoint for SWA (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024);
+        SLT_WRN(slot, "erased invalidated context checkpoint (pos_min = %d, pos_max = %d, n_swa = %d, size = %.3f MiB)\n", cur.pos_min, cur.pos_max, n_swa, (float) cur.data.size() / 1024 / 1024);
         slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin() + i);
     }
 }
@@ -3854,32 +3854,35 @@ struct server_context {
 // prompt evaluated for next-token prediction
 slot.state = SLOT_STATE_GENERATING;

-// make a checkpoint of the parts of memory that cannot be rolled back.
-// checkpoints are needed only if:
+// make a checkpoint of the parts of the memory that cannot be rolled back.
+// checkpoints are created only if:
 //  - the model uses SWA and we are not using `swa_full`
 //  - the model architecture is marked as recurrent or hybrid
-bool do_checkpoint = (llama_model_is_recurrent(model) || llama_model_is_hybrid(model)) ||
-                     (llama_model_n_swa(model) > 0 && !params_base.swa_full);
+//
+// TODO: try to make this conditional on the context or the memory module, instead of the model type
+const bool do_checkpoint =
+    (llama_model_is_recurrent(model) || llama_model_is_hybrid(model)) ||
+    (llama_model_n_swa(model) > 0 && !params_base.swa_full);

 if (do_checkpoint && params_base.n_ctx_checkpoints > 0) {
-    if (slot.ctx_checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
+    while (slot.ctx_checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
         // make room for the new checkpoint, if needed
-        const auto & cur = slot.ctx_checkpoints.back();
+        const auto & cur = slot.ctx_checkpoints.front();
         SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
                 cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);

         slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin());
     }

-    const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY);
+    const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

     auto & cur = slot.ctx_checkpoints.emplace_back(ctx_checkpoint{
         /*.pos_min = */ llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id),
         /*.pos_max = */ llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id),
         /*.data    = */ std::vector<uint8_t>(checkpoint_size),
     });

-    llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_CHECKPOINT_ONLY);
+    llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

     SLT_WRN(slot, "saved context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
             (int) slot.ctx_checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
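
The eviction change above now drops the oldest checkpoints until the cap is respected, instead of reading the newest entry while erasing the front. A standalone sketch of the same FIFO policy, using hypothetical stand-in types rather than the actual server structures:

    // illustrative FIFO eviction for a bounded list of checkpoints (stand-in types, not server.cpp code)
    #include <cstdint>
    #include <utility>
    #include <vector>

    struct checkpoint {
        int pos_min;
        int pos_max;
        std::vector<uint8_t> data;
    };

    static void push_checkpoint(std::vector<checkpoint> & checkpoints, size_t n_max, checkpoint cp) {
        // evict the oldest entries (front of the vector) until there is room for the new one
        while (n_max > 0 && checkpoints.size() >= n_max) {
            checkpoints.erase(checkpoints.begin());
        }
        checkpoints.push_back(std::move(cp));
    }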
