@@ -2385,6 +2385,10 @@ struct server_context {
23852385        llama_batch_free (batch);
23862386    }
23872387
2388+     int32_t  n_ctx_slot () const  { // context size assigned to each slot's n_ctx
2389+         return  params_base.kv_unified  ? n_ctx : n_ctx / params_base.n_parallel ; // kv_unified set: full n_ctx per slot; otherwise n_ctx divided by the slot count (n_parallel)
2390+     }
2391+ 
23882392    bool  load_model (const  common_params & params) {
23892393        SRV_INF (" loading model '%s'\n "  , params.model .path .c_str ());
23902394
@@ -2413,7 +2417,7 @@ struct server_context {
24132417
24142418            params_dft.devices       = params_base.speculative .devices ;
24152419            params_dft.model         = params_base.speculative .model ;
2416-             params_dft.n_ctx         = params_base.speculative .n_ctx  == 0  ? slots. front (). n_ctx  : params_base.speculative .n_ctx ;
2420+             params_dft.n_ctx         = params_base.speculative .n_ctx  == 0  ? n_ctx_slot ()  : params_base.speculative .n_ctx ;
24172421            params_dft.n_gpu_layers  = params_base.speculative .n_gpu_layers ;
24182422            params_dft.n_parallel    = 1 ;
24192423            params_dft.cache_type_k  = params_base.speculative .cache_type_k ;
@@ -2501,16 +2505,14 @@ struct server_context {
25012505    }
25022506
25032507    void  init () {
2504-         const  int32_t  n_ctx_slot = params_base.kv_unified  ? n_ctx : n_ctx / params_base.n_parallel ;
2505- 
25062508        SRV_INF (" initializing slots, n_slots = %d\n "  , params_base.n_parallel );
25072509
25082510        for  (int  i = 0 ; i < params_base.n_parallel ; i++) {
25092511            server_slot slot;
25102512
25112513            slot.id  = i;
25122514            slot.ctx  = ctx;
2513-             slot.n_ctx  = n_ctx_slot;
2515+             slot.n_ctx  = n_ctx_slot () ;
25142516            slot.mctx  = mctx;
25152517            slot.prompt .tokens .has_mtmd  = mctx != nullptr ;
25162518
@@ -2533,7 +2535,7 @@ struct server_context {
25332535                }
25342536            }
25352537
2536-             SLT_INF (slot, " new slot n_ctx_slot  = %d\n "  , slot.n_ctx );
2538+             SLT_INF (slot, " new slot, n_ctx  = %d\n "  , slot.n_ctx );
25372539
25382540            slot.callback_on_release  = [this ](int ) {
25392541                queue_tasks.pop_deferred_task ();
@@ -3718,7 +3720,7 @@ struct server_context {
37183720                        slot.n_past  = 0 ;
37193721                        slot.state  = SLOT_STATE_PROCESSING_PROMPT;
37203722
3721-                         SLT_INF (slot, " new prompt, n_ctx_slot  = %d, n_keep = %d, n_prompt_tokens = %d\n "  ,
3723+                         SLT_INF (slot, " new prompt, n_ctx  = %d, n_keep = %d, n_prompt_tokens = %d\n "  ,
37223724                                slot.n_ctx , slot.task ->params .n_keep , slot.n_prompt_tokens ());
37233725
37243726                        //  print prompt tokens (for debugging)
0 commit comments