Commit 5b0d207

server : handle state restore fails
1 parent 487b922 commit 5b0d207


tools/server/server.cpp

Lines changed: 21 additions & 11 deletions
@@ -3314,30 +3314,40 @@ struct server_context {
     const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
 
     if (pos_min > pos_min_thold) {
+        SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
+
         // search for a SWA checkpoint
-        auto it = std::find_if(
+        const auto it = std::find_if(
                 slot.swa_checkpoints.rbegin(),
                 slot.swa_checkpoints.rend(),
                 [&](const auto & cur) {
                     return cur.pos_min <= pos_min_thold;
                 }
                 );
 
-        if (it == slot.swa_checkpoints.rend()) {
-            SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
-            SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
-                    "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+        bool do_reset = it == slot.swa_checkpoints.rend();
 
-            slot.n_past = 0;
-            slot.swa_checkpoints.clear();
-        } else {
+        if (!do_reset) {
             // restore the checkpoint
             const size_t swa_size = it->data.size();
-            llama_state_seq_set_data(ctx, it->data.data(), swa_size, slot.id);
+            const size_t n = llama_state_seq_set_data(ctx, it->data.data(), swa_size, slot.id);
+
+            if (n != swa_size) {
+                SLT_ERR(slot, "failed to restore SWA checkpoint, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
+                do_reset = true;
+            } else {
+                slot.n_past = std::min(slot.n_past, it->pos_max);
 
-            slot.n_past = std::min(slot.n_past, it->pos_max);
+                SLT_WRN(slot, "SWA checkpoint restore, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
+            }
+        }
 
-            SLT_WRN(slot, "SWA checkpoint restore, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
+        if (do_reset) {
+            SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
+                    "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+            slot.n_past = 0;
+            slot.swa_checkpoints.clear();
         }
     }
 }
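
The point of the change is that `llama_state_seq_set_data` reports how many bytes it consumed, and the previous code discarded that value: a failed or partial restore could leave the sequence state inconsistent while the server continued as if the checkpoint had loaded. The rewrite funnels both failure modes (no usable checkpoint, or a restore that returns fewer bytes than the checkpoint holds) into a single `do_reset` path that forces full prompt re-processing. Below is a minimal, self-contained sketch of that pattern; the `checkpoint`/`slot_state` structs and the `restore_state` stub are hypothetical stand-ins for the server's internals, not the llama.cpp API.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the server's slot/checkpoint structures.
struct checkpoint {
    int pos_min;
    int pos_max;
    std::vector<uint8_t> data;
};

struct slot_state {
    int n_past = 0;
    std::vector<checkpoint> swa_checkpoints;
};

// Stub standing in for llama_state_seq_set_data: returns the number of
// bytes it read from the buffer (0, or a short count, on failure).
static size_t restore_state(const uint8_t * /*src*/, size_t size) {
    return size; // pretend the restore succeeded
}

// Mirrors the commit's control flow: pick the newest checkpoint that is
// old enough, try to restore it, and treat a short restore exactly like
// a missing checkpoint: reset and re-process the prompt from scratch.
static void restore_or_reset(slot_state & slot, int pos_min_thold) {
    const auto it = std::find_if(
            slot.swa_checkpoints.rbegin(),
            slot.swa_checkpoints.rend(),
            [&](const checkpoint & cur) {
                return cur.pos_min <= pos_min_thold;
            });

    bool do_reset = it == slot.swa_checkpoints.rend();

    if (!do_reset) {
        const size_t size = it->data.size();
        const size_t n    = restore_state(it->data.data(), size);

        if (n != size) {
            std::fprintf(stderr, "checkpoint restore failed (%zu of %zu bytes)\n", n, size);
            do_reset = true; // fall through to the reset path below
        } else {
            // the checkpoint may be older than n_past: roll back to it
            slot.n_past = std::min(slot.n_past, it->pos_max);
        }
    }

    if (do_reset) {
        slot.n_past = 0;              // force full prompt re-processing
        slot.swa_checkpoints.clear(); // drop checkpoints that no longer help
    }
}

int main() {
    slot_state slot;
    slot.n_past = 100;
    slot.swa_checkpoints.push_back({/*pos_min=*/10, /*pos_max=*/90, std::vector<uint8_t>(64)});

    restore_or_reset(slot, /*pos_min_thold=*/50);
    std::printf("n_past after restore: %d\n", slot.n_past); // prints 90
}

One thing the single `do_reset` flag buys: the fallback logging and cleanup now live in one place, so a future third failure mode only needs to set the flag rather than duplicate the reset code.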
