Skip to content

Commit 15b202b

Browse files
committed
cont : fix speculative decoding initialization
1 parent 42e9fe8 commit 15b202b

File tree

1 file changed

+8
-6
lines changed

1 file changed

+8
-6
lines changed

tools/server/server.cpp

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2385,6 +2385,10 @@ struct server_context {
23852385
llama_batch_free(batch);
23862386
}
23872387

2388+
int32_t n_ctx_slot() const {
2389+
return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel;
2390+
}
2391+
23882392
bool load_model(const common_params & params) {
23892393
SRV_INF("loading model '%s'\n", params.model.path.c_str());
23902394

@@ -2413,7 +2417,7 @@ struct server_context {
24132417

24142418
params_dft.devices = params_base.speculative.devices;
24152419
params_dft.model = params_base.speculative.model;
2416-
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx;
2420+
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx;
24172421
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
24182422
params_dft.n_parallel = 1;
24192423
params_dft.cache_type_k = params_base.speculative.cache_type_k;
@@ -2501,16 +2505,14 @@ struct server_context {
25012505
}
25022506

25032507
void init() {
2504-
const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel;
2505-
25062508
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
25072509

25082510
for (int i = 0; i < params_base.n_parallel; i++) {
25092511
server_slot slot;
25102512

25112513
slot.id = i;
25122514
slot.ctx = ctx;
2513-
slot.n_ctx = n_ctx_slot;
2515+
slot.n_ctx = n_ctx_slot();
25142516
slot.mctx = mctx;
25152517
slot.prompt.tokens.has_mtmd = mctx != nullptr;
25162518

@@ -2533,7 +2535,7 @@ struct server_context {
25332535
}
25342536
}
25352537

2536-
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
2538+
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
25372539

25382540
slot.callback_on_release = [this](int) {
25392541
queue_tasks.pop_deferred_task();
@@ -3697,7 +3699,7 @@ struct server_context {
36973699
slot.n_past = 0;
36983700
slot.state = SLOT_STATE_PROCESSING_PROMPT;
36993701

3700-
SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n",
3702+
SLT_INF(slot, "new prompt, n_ctx = %d, n_keep = %d, n_prompt_tokens = %d\n",
37013703
slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens());
37023704

37033705
// print prompt tokens (for debugging)

0 commit comments

Comments (0)