@@ -694,12 +694,9 @@ struct server_context {
694694
695695 params_dft.devices = params_base.speculative.devices;
696696 params_dft.model = params_base.speculative.model;
697- params_dft.n_ctx = params_base.speculative.n_ctx;
697+ params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
698698 params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
699-
700- // force F16 KV cache for the draft model for extra performance
701- params_dft.cache_type_k = "f16";
702- params_dft.cache_type_v = "f16";
699+ params_dft.n_parallel = 1 ;
703700
704701 common_init_result llama_init_dft = common_init_from_params (params_dft);
705702
@@ -719,8 +716,14 @@ struct server_context {
719716 return false ;
720717 }
721718
719+ const int n_ctx_dft = llama_n_ctx (llama_init_dft.context );
720+
722721 cparams_dft = common_context_params_to_llama (params_dft);
723- cparams_dft.n_batch = llama_n_ctx (llama_init_dft.context );
722+ cparams_dft.n_batch = n_ctx_dft;
723+
724+ // force F16 KV cache for the draft model for extra performance
725+ cparams_dft.type_k = GGML_TYPE_F16;
726+ cparams_dft.type_v = GGML_TYPE_F16;
724727
725728 // the context is not needed - we will create one for each slot
726729 llama_free (llama_init_dft.context );
@@ -2312,6 +2315,10 @@ struct server_context {
23122315 continue ;
23132316 }
23142317
2318+ if (slot.state != SLOT_STATE_GENERATING) {
2319+ continue ;
2320+ }
2321+
23152322 llama_token id = slot.sampled ;
23162323
23172324 struct common_speculative_params params_spec;
0 commit comments