@@ -1263,6 +1263,18 @@ static std::string list_builtin_chat_templates() {
     return msg.str();
 }
 
+static bool is_truthy(const std::string & value) {
+    return value == "on" || value == "enabled" || value == "1";
+}
+
+static bool is_falsey(const std::string & value) {
+    return value == "off" || value == "disabled" || value == "0";
+}
+
+static bool is_autoy(const std::string & value) {
+    return value == "auto" || value == "-1";
+}
+
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     // load dynamic backends
     ggml_backend_load_all();
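For reference, the three helpers above define the single tri-state vocabulary that the reworked flags below share. A minimal, self-contained sketch of the accepted spellings; the helpers are copied from the hunk, and the main() check is illustrative only, not part of the commit:

// Standalone check of the tri-state spellings accepted by the new helpers.
#include <cassert>
#include <string>

static bool is_truthy(const std::string & v) { return v == "on"   || v == "enabled"  || v == "1"; }
static bool is_falsey(const std::string & v) { return v == "off"  || v == "disabled" || v == "0"; }
static bool is_autoy (const std::string & v) { return v == "auto" || v == "-1"; }

int main() {
    assert(is_truthy("enabled") && !is_truthy("off"));
    assert(is_falsey("0")       && !is_falsey("auto"));
    assert(is_autoy("-1")       && !is_autoy("1"));   // "-1" selects auto; "1" means on
    return 0;
}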
@@ -1544,13 +1556,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_chunks = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
-    add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.flash_attn = true;
-        }
-    ).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(common_arg({"-fa", "--flash-attn"}, "[on|off|auto]",
+            string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
+                    llama_flash_attn_type_name(params.flash_attn_type)),
+            [](common_params & params, const std::string & value) {
+                if (is_truthy(value)) {
+                    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+                } else if (is_falsey(value)) {
+                    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+                } else if (is_autoy(value)) {
+                    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+                } else {
+                    throw std::runtime_error(
+                            string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+                }
+            }).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         "prompt to start generation with; for system message, use -sys",
@@ -2458,7 +2478,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        "number of layers to store in VRAM",
+        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {
@@ -2954,20 +2974,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(common_arg(
-        {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--props"},
         string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_props = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--no-slots"},
         "disables slots monitoring endpoint",
@@ -3126,13 +3146,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_file(common_log_main(), value.c_str());
         }
     ));
-    add_opt(common_arg(
-        {"--log-colors"},
-        "Enable colored logging",
-        [](common_params &) {
-            common_log_set_colors(common_log_main(), true);
-        }
-    ).set_env("LLAMA_LOG_COLORS"));
+    add_opt(common_arg({"--log-colors"}, "[on|off|auto]",
+            "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
+            "'auto' enables colors when output is to a terminal",
+            [](common_params &, const std::string & value) {
+                if (is_truthy(value)) {
+                    common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
+                } else if (is_falsey(value)) {
+                    common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
+                } else if (is_autoy(value)) {
+                    common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
+                } else {
+                    throw std::invalid_argument(
+                            string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
+                }
+            }).set_env("LLAMA_LOG_COLORS"));
     add_opt(common_arg(
         {"-v", "--verbose", "--log-verbose"},
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3459,8 +3487,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
         params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
@@ -3475,8 +3501,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
         params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
@@ -3491,8 +3515,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
         params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
@@ -3508,10 +3530,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
         params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
         params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-        params.speculative.n_gpu_layers = 99;
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
@@ -3527,10 +3546,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
         params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
         params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-        params.speculative.n_gpu_layers = 99;
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
@@ -3545,8 +3561,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
         params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
         params.port = 8012;
-        params.n_gpu_layers = 99;
-        params.flash_attn = true;
         params.n_ubatch = 1024;
         params.n_batch = 1024;
         params.n_ctx = 0;
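All of the server presets above stop pinning n_gpu_layers = 99, speculative.n_gpu_layers = 99, and flash_attn = true; they now inherit the global defaults instead. A hedged sketch of what that inheritance looks like, with values inferred from the rest of the diff (the -ngl help text now prints params.n_gpu_layers, and Flash Attention resolves through the tri-state) rather than quoted from common.h:

// Assumed effective defaults after this change; illustrative only.
struct preset_defaults_sketch {
    int n_gpu_layers    = -1; // assumption: auto, i.e. offload as many layers as fit
    int flash_attn_type = -1; // assumption: LLAMA_FLASH_ATTN_TYPE_AUTO, resolved at context creation
};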