@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
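+// GGML data types accepted for the K/V cache (full-precision and quantized).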
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
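+// Map a type name as printed by ggml_type_name() (e.g. "q8_0") back to its ggml_type; throws if the name is not a supported cache type.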
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
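+// Build a comma-separated list of the supported cache type names for --help text.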
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
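+        // comparing addresses identifies the last element, so the trailing ", " is omitted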
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1184,18 +1213,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -2093,35 +2132,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2141,14 +2180,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }