@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
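
The lookup added above is easy to exercise in isolation. Below is a minimal illustrative harness, assuming the ggml headers are available; `supported` and `type_from_str` are a reduced copy of the patch's `kv_cache_types` / `kv_cache_type_from_str`, and the `main` driver is not part of the patch:

    // Illustrative harness only (not part of the patch): a reduced copy of the
    // new lookup, showing how valid names resolve and invalid names throw.
    #include "ggml.h"

    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    static const std::vector<ggml_type> supported = {
        GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, // subset of kv_cache_types
    };

    static ggml_type type_from_str(const std::string & s) {
        for (const auto & type : supported) {
            if (ggml_type_name(type) == s) { // names come from ggml, e.g. "q8_0"
                return type;
            }
        }
        throw std::runtime_error("Unsupported cache type: " + s);
    }

    int main() {
        printf("%s\n", ggml_type_name(type_from_str("q8_0"))); // -> q8_0
        try {
            type_from_str("q4_k"); // not in the list -> rejected
        } catch (const std::runtime_error & e) {
            fprintf(stderr, "%s\n", e.what()); // Unsupported cache type: q4_k
        }
        return 0;
    }

Keeping the allowed set in one `std::vector` means the validator and the `--help` text (via `get_all_kv_cache_types`) can never drift apart.
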
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
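
Note the type change implied by this hunk: the default is now printed with `ggml_type_name(params.cache_type_k)` rather than `params.cache_type_k.c_str()`, so `cache_type_k` / `cache_type_v` in `common_params` must have moved from `std::string` to `ggml_type`. That header change is outside this excerpt; a sketch of the assumed before/after declaration (the F16 defaults are an assumption, not shown in this diff):

    // Assumed shape of the common_params change; the actual header diff is
    // not part of this excerpt, and the F16 defaults are an assumption.
    struct common_params {
        // before: std::string cache_type_k = "f16";
        //         std::string cache_type_v = "f16";
        ggml_type cache_type_k = GGML_TYPE_F16; // set via kv_cache_type_from_str()
        ggml_type cache_type_v = GGML_TYPE_F16;
        // ...
    };

Storing the enum instead of the raw string moves validation to parse time: a typo now fails immediately with "Unsupported cache type" instead of being discovered when the KV cache is allocated.
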
@@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
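
Each `set_env(...)` added in the last two hunks registers an environment-variable alias for its flag, so for example `LLAMA_ARG_DRAFT_MAX` can stand in for `--draft-max`. Conceptually the parser resolves such an option as in the sketch below; this is a hedged illustration of the pattern, not the actual `common_arg` internals, `resolve_draft_max` is a hypothetical helper, and the default of 16 is illustrative only:

    // Hedged sketch of how an env-registered option is conceptually resolved;
    // not the real common_arg implementation.
    #include <cstdlib>
    #include <string>

    static int resolve_draft_max(bool cli_given, int cli_value) {
        if (cli_given) {
            return cli_value;                      // an explicit flag wins
        }
        if (const char * env = std::getenv("LLAMA_ARG_DRAFT_MAX")) {
            return std::stoi(env);                 // fall back to the env alias
        }
        return 16;                                 // illustrative default only
    }

With that in place, exporting `LLAMA_ARG_DRAFT_MAX=24` before launching has the same effect as passing `--draft-max 24`, which is convenient for containerized server deployments where flags are awkward to thread through.
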