@@ -2083,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20832083 [](common_params & params, int value) {
20842084 params.speculative .n_max = value;
20852085 }
2086- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
2086+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
20872087 add_opt (common_arg (
20882088 {" --draft-min" , " --draft-n-min" }, " N" ,
20892089 string_format (" minimum number of draft tokens to use for speculative decoding (default: %d)" , params.speculative .n_min ),
20902090 [](common_params & params, int value) {
20912091 params.speculative .n_min = value;
20922092 }
2093- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
2093+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
20942094 add_opt (common_arg (
20952095 {" --draft-p-split" }, " P" ,
20962096 string_format (" speculative decoding split probability (default: %.1f)" , (double )params.speculative .p_split ),
20972097 [](common_params & params, const std::string & value) {
20982098 params.speculative .p_split = std::stof (value);
20992099 }
2100- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2100+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
21012101 add_opt (common_arg (
21022102 {" --draft-p-min" }, " P" ,
21032103 string_format (" minimum speculative decoding probability (greedy) (default: %.1f)" , (double )params.speculative .p_min ),
21042104 [](common_params & params, const std::string & value) {
21052105 params.speculative .p_min = std::stof (value);
21062106 }
2107- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2107+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
21082108 add_opt (common_arg (
21092109 {" -cd" , " --ctx-size-draft" }, " N" ,
21102110 string_format (" size of the prompt context for the draft model (default: %d, 0 = loaded from model)" , params.speculative .n_ctx ),
21112111 [](common_params & params, int value) {
21122112 params.speculative .n_ctx = value;
21132113 }
2114- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2114+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
21152115 add_opt (common_arg (
21162116 {" -devd" , " --device-draft" }, " <dev1,dev2,..>" ,
21172117 " comma-separated list of devices to use for offloading the draft model (none = don't offload)\n "
@@ -2131,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
21312131 fprintf (stderr, " warning: consult docs/build.md for compilation instructions\n " );
21322132 }
21332133 }
2134- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2134+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
21352135 add_opt (common_arg (
21362136 {" -md" , " --model-draft" }, " FNAME" ,
21372137 " draft model for speculative decoding (default: unused)" ,
21382138 [](common_params & params, const std::string & value) {
21392139 params.speculative .model = value;
21402140 }
2141- ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2141+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
21422142
21432143 return ctx_arg;
21442144}
0 commit comments