@@ -609,7 +609,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_draft = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.n_draft_min),
+        [](common_params & params, int value) {
+            params.n_draft_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ps", "--p-split"}, "N",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
@@ -1454,7 +1461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1599,7 +1606,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.model_draft = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",