@@ -145,6 +145,39 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
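+// KV cache data types accepted by -ctk/--cache-type-k and -ctv/--cache-type-v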
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
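+// resolve a user-supplied type name (e.g. "q8_0") to its ggml_type; throws on unknown names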
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
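+// build a comma-separated list of the supported type names, used in the --help text below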
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
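+        // append ", " after every entry except the last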
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }