@@ -681,25 +681,26 @@ struct server_context {
681681 add_bos_token = llama_add_bos_token (model);
682682 has_eos_token = !llama_add_eos_token (model);
683683
684- if (!params.model_draft .empty ()) {
685- SRV_INF (" loading draft model '%s'\n " , params_. model_draft .c_str ());
684+ if (!params.speculative . model .empty ()) {
685+ SRV_INF (" loading draft model '%s'\n " , params. speculative . model .c_str ());
686686
687687 auto params_dft = params;
688688
689- params_dft.model = params.model_draft ;
690- params_dft.n_gpu_layers = params.n_gpu_layers_draft ;
689+ params_dft.model = params.speculative .model ;
690+ params_dft.n_ctx = params.speculative .n_ctx ;
691+ params_dft.n_gpu_layers = params.speculative .n_gpu_layers ;
691692
692693 common_init_result llama_init_dft = common_init_from_params (params_dft);
693694
694695 model_dft = llama_init_dft.model ;
695696
696697 if (model_dft == nullptr ) {
697- SRV_ERR (" failed to load draft model, '%s'\n " , params.model_draft .c_str ());
698+ SRV_ERR (" failed to load draft model, '%s'\n " , params.speculative . model .c_str ());
698699 return false ;
699700 }
700701
701702 if (!common_speculative_are_compatible (ctx, llama_init_dft.context )) {
702- SRV_ERR (" the draft model '%s' is not compatible with the target model '%s'\n " , params.model_draft .c_str (), params.model .c_str ());
703+ SRV_ERR (" the draft model '%s' is not compatible with the target model '%s'\n " , params.speculative . model .c_str (), params.model .c_str ());
703704
704705 llama_free (llama_init_dft.context );
705706 llama_free_model (llama_init_dft.model );
@@ -755,7 +756,7 @@ struct server_context {
755756 return ;
756757 }
757758
758- slot.batch_spec = llama_batch_init (params.n_draft + 1 , 0 , 1 );
759+ slot.batch_spec = llama_batch_init (params.speculative . n_max + 1 , 0 , 1 );
759760 }
760761
761762 SLT_INF (slot, " new slot n_ctx_slot = %d\n " , slot.n_ctx );
@@ -2287,13 +2288,14 @@ struct server_context {
22872288
22882289 // TODO: configurable through requests
22892290 struct common_speculative_params params_spec;
2290- params_spec.n_draft = params.n_draft ;
2291+ params_spec.n_draft = params.speculative . n_max ;
22912292 params_spec.n_reuse = 256 ;
2292- params_spec.p_min = 0 . 9f ;
2293+ params_spec.p_min = params. speculative . p_min ;
22932294
22942295 llama_tokens draft = common_speculative_gen_draft (slot.spec , params_spec, slot.cache_tokens , id);
22952296
2296- if (params.n_draft_min > (int ) draft.size ()) {
2297+ // ignore small drafts
2298+ if (params.speculative .n_min > (int ) draft.size ()) {
22972299 continue ;
22982300 }
22992301
@@ -2321,9 +2323,7 @@ struct server_context {
23212323 for (size_t i = 0 ; i < ids.size (); ++i) {
23222324 completion_token_output result;
23232325
2324- id = ids[i];
2325-
2326- result.tok = id;
2326+ result.tok = ids[i];
23272327
23282328 if (!process_token (result, slot)) {
23292329 // release slot because of stop condition
0 commit comments