@@ -2083,35 +2083,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
20832083        [](common_params & params, int value) {
20842084            params.speculative.n_max = value;
20852085        }
2086-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
2086+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
20872087    add_opt(common_arg(
20882088        {"--draft-min", "--draft-n-min"}, "N",
20892089        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
20902090        [](common_params & params, int value) {
20912091            params.speculative.n_min = value;
20922092        }
2093-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
2093+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
20942094    add_opt(common_arg(
20952095        {"--draft-p-split"}, "P",
20962096        string_format("speculative decoding split probability (default: %.1f)", (double) params.speculative.p_split),
20972097        [](common_params & params, const std::string & value) {
20982098            params.speculative.p_split = std::stof(value);
20992099        }
2100-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
2100+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
21012101    add_opt(common_arg(
21022102        {"--draft-p-min"}, "P",
21032103        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double) params.speculative.p_min),
21042104        [](common_params & params, const std::string & value) {
21052105            params.speculative.p_min = std::stof(value);
21062106        }
2107-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2107+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
21082108    add_opt(common_arg(
21092109        {"-cd", "--ctx-size-draft"}, "N",
21102110        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
21112111        [](common_params & params, int value) {
21122112            params.speculative.n_ctx = value;
21132113        }
2114-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2114+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
21152115    add_opt(common_arg(
21162116        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
21172117        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2131,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
21312131                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
21322132            }
21332133        }
2134-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2134+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
21352135    add_opt(common_arg(
21362136        {"-md", "--model-draft"}, "FNAME",
21372137        "draft model for speculative decoding (default: unused)",
21382138        [](common_params & params, const std::string & value) {
21392139            params.speculative.model = value;
21402140        }
2141-     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
2141+     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
21422142
21432143    return  ctx_arg;
21442144}
0 commit comments