@@ -241,6 +241,10 @@ struct server_slot {
241241 return state != SLOT_STATE_IDLE;
242242 }
243243
 244+ bool can_speculate () const { // reports whether this slot is eligible for speculative decoding
 245+ return ctx_dft && params.speculative .n_max > 0 && params.cache_prompt ; // all three must hold: ctx_dft is set (presumably the draft-model context - confirm), the draft-token cap n_max is positive, and prompt caching is enabled for the request
 246+ }
247+
244248 void add_token (const completion_token_output & token) {
245249 if (!is_processing ()) {
246250 SLT_WRN (*this , " %s" , " slot is not processing\n " );
@@ -1270,7 +1274,7 @@ struct server_context {
12701274 {" min_keep" , slot.params .sampling .min_keep },
12711275 {" grammar" , slot.params .sampling .grammar },
12721276 {" samplers" , samplers},
1273- {" speculative" , slot.params . speculative . model . empty () ? false : true },
1277+ {" speculative" , slot.can_speculate () },
12741278 {" speculative.n_max" , slot.params .speculative .n_max },
12751279 {" speculative.n_min" , slot.params .speculative .n_min },
12761280 {" speculative.p_min" , slot.params .speculative .p_min },
@@ -2302,11 +2306,10 @@ struct server_context {
23022306 }
23032307
23042308 // check if the slot supports speculative decoding
2305- if (!slot.ctx_dft || slot. params . speculative . n_max <= 0 || !slot. params . cache_prompt ) {
2309+ if (!slot.can_speculate () ) {
23062310 continue ;
23072311 }
23082312
2309- // TODO: configurable through requests
23102313 struct common_speculative_params params_spec;
23112314 params_spec.n_draft = slot.params .speculative .n_max ;
23122315 params_spec.n_reuse = llama_n_ctx (slot.ctx_dft ) - slot.params .speculative .n_max ;
0 commit comments