Commit 11b4d58

server : various params fixes
ggml-ci
1 parent: f325205

1 file changed (+13, -6)

examples/server/server.cpp

Lines changed: 13 additions & 6 deletions
@@ -694,12 +694,9 @@ struct server_context {
 
     params_dft.devices = params_base.speculative.devices;
     params_dft.model = params_base.speculative.model;
-    params_dft.n_ctx = params_base.speculative.n_ctx;
+    params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
     params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
-
-    // force F16 KV cache for the draft model for extra performance
-    params_dft.cache_type_k = "f16";
-    params_dft.cache_type_v = "f16";
+    params_dft.n_parallel = 1;
 
     common_init_result llama_init_dft = common_init_from_params(params_dft);
 
@@ -719,8 +716,14 @@ struct server_context {
        return false;
    }
 
+   const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
    cparams_dft = common_context_params_to_llama(params_dft);
-   cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+   cparams_dft.n_batch = n_ctx_dft;
+
+   // force F16 KV cache for the draft model for extra performance
+   cparams_dft.type_k = GGML_TYPE_F16;
+   cparams_dft.type_v = GGML_TYPE_F16;
 
    // the context is not needed - we will create one for each slot
    llama_free(llama_init_dft.context);
@@ -2312,6 +2315,10 @@ struct server_context {
        continue;
    }
 
+   if (slot.state != SLOT_STATE_GENERATING) {
+       continue;
+   }
+
    llama_token id = slot.sampled;
 
    struct common_speculative_params params_spec;
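
Note on the draft-context default: with this change, when no draft context size is given (params_base.speculative.n_ctx == 0), the draft model's context defaults to the base context divided by the number of parallel slots, matching the per-slot draft contexts mentioned in the diff comment; the F16 KV-cache override now applies to those per-slot context params (cparams_dft) instead of the initial draft-model load. Below is a minimal standalone sketch of the sizing rule only; it is not llama.cpp code, and the helper name draft_n_ctx and the example values are hypothetical.

    // Sketch of the draft-context sizing rule (hypothetical helper, not llama.cpp API).
    #include <cstdio>

    static int draft_n_ctx(int n_ctx_spec, int n_ctx_base, int n_parallel) {
        // 0 means "not set by the user": fall back to an even split of the
        // base context across parallel slots; otherwise use the explicit value.
        return n_ctx_spec == 0 ? n_ctx_base / n_parallel : n_ctx_spec;
    }

    int main() {
        std::printf("%d\n", draft_n_ctx(   0, 8192, 4)); // default: 8192 / 4 = 2048 per slot
        std::printf("%d\n", draft_n_ctx(4096, 8192, 4)); // explicit draft context wins: 4096
        return 0;
    }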
