Skip to content

Commit 382135a

Browse files
committed
Fix MTP KV-cache update sequencing: perform the update after sampling/acceptance, once following prompt processing rather than before each sampled token
1 parent 6870f97 commit 382135a

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

tools/server/server.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3543,18 +3543,19 @@ struct server_context {
35433543

35443544
const int tok_idx = slot.i_batch - i;
35453545

3546-
// This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
3547-
if (slot.has_mtp) {
3548-
mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
3549-
}
3550-
35513546
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
35523547
slot.last_tok_idx = tok_idx;
3548+
SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
35533549

35543550
slot.i_batch = -1;
35553551

35563552
common_sampler_accept(slot.smpl, id, true);
35573553

3554+
// This should only trigger on a non-empty update batch once, after prompt processing but not during token generation
3555+
if (slot.has_mtp) {
3556+
mtp_update_kv_cache(ctx, slot.mtp_kv_update_batch);
3557+
}
3558+
35583559
slot.n_decoded += 1;
35593560

35603561
const int64_t t_current = ggml_time_us();

0 commit comments

Comments
 (0)