@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15451545        }
15461546    ).set_examples ({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
15471547    add_opt (common_arg (
1548-         {" -fa" " --flash-attn" 
1549-         string_format (" enable Flash Attention (default: %s)" flash_attn  ? " enabled" " disabled" 
1550-         [](common_params & params) {
1551-             params.flash_attn  = true ;
1548+         {" -fa" " --flash-attn" " FA" 
1549+         string_format (" set Flash Attention use ('on', 'off', or 'auto', default: '%s')" llama_flash_attn_type_name (params.flash_attn_type )),
1550+         [](common_params & params, const  std::string & value) {
1551+             if  (value == " on" " enabled" " 1" 
1552+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1553+             } else  if  (value == " off" " disabled" " 0" 
1554+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1555+             } else  if  (value == " auto" " -1" 
1556+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_AUTO;
1557+             } else  {
1558+                 throw  std::runtime_error (string_format (" error: unknown value for --flash-attn: '%s'\n " c_str ()));
1559+             }
15521560        }
15531561    ).set_env (" LLAMA_ARG_FLASH_ATTN" 
15541562    add_opt (common_arg (
@@ -2954,20 +2962,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
29542962            params.endpoint_metrics  = true ;
29552963        }
29562964    ).set_examples ({LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_ENDPOINT_METRICS" 
2957-     add_opt (common_arg (
2958-         {" --slots" 
2959-         string_format (" enable slots monitoring endpoint (default: %s)" endpoint_slots  ? " enabled" " disabled" 
2960-         [](common_params & params) {
2961-             params.endpoint_slots  = true ;
2962-         }
2963-     ).set_examples ({LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_ENDPOINT_SLOTS" 
29642965    add_opt (common_arg (
29652966        {" --props" 
29662967        string_format (" enable changing global properties via POST /props (default: %s)" endpoint_props  ? " enabled" " disabled" 
29672968        [](common_params & params) {
29682969            params.endpoint_props  = true ;
29692970        }
29702971    ).set_examples ({LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_ENDPOINT_PROPS" 
2972+     add_opt (common_arg (
2973+         {" --slots" 
2974+         string_format (" enable slots monitoring endpoint (default: %s)" endpoint_slots  ? " enabled" " disabled" 
2975+         [](common_params & params) {
2976+             params.endpoint_slots  = true ;
2977+         }
2978+     ).set_examples ({LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_ENDPOINT_SLOTS" 
29712979    add_opt (common_arg (
29722980        {" --no-slots" 
29732981        " disables slots monitoring endpoint" 
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34593467            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF" 
34603468            params.model .hf_file  = " qwen2.5-coder-1.5b-q8_0.gguf" 
34613469            params.port  = 8012 ;
3462-             params.n_gpu_layers  = 99 ;
3463-             params.flash_attn  = true ;
34643470            params.n_ubatch  = 1024 ;
34653471            params.n_batch  = 1024 ;
34663472            params.n_ctx  = 0 ;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34753481            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF" 
34763482            params.model .hf_file  = " qwen2.5-coder-3b-q8_0.gguf" 
34773483            params.port  = 8012 ;
3478-             params.n_gpu_layers  = 99 ;
3479-             params.flash_attn  = true ;
34803484            params.n_ubatch  = 1024 ;
34813485            params.n_batch  = 1024 ;
34823486            params.n_ctx  = 0 ;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34913495            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" 
34923496            params.model .hf_file  = " qwen2.5-coder-7b-q8_0.gguf" 
34933497            params.port  = 8012 ;
3494-             params.n_gpu_layers  = 99 ;
3495-             params.flash_attn  = true ;
34963498            params.n_ubatch  = 1024 ;
34973499            params.n_batch  = 1024 ;
34983500            params.n_ctx  = 0 ;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35083510            params.model .hf_file  = " qwen2.5-coder-7b-q8_0.gguf" 
35093511            params.speculative .model .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" 
35103512            params.speculative .model .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf" 
3511-             params.speculative .n_gpu_layers  = 99 ;
35123513            params.port  = 8012 ;
3513-             params.n_gpu_layers  = 99 ;
3514-             params.flash_attn  = true ;
35153514            params.n_ubatch  = 1024 ;
35163515            params.n_batch  = 1024 ;
35173516            params.n_ctx  = 0 ;
@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35273526            params.model .hf_file  = " qwen2.5-coder-14b-q8_0.gguf" 
35283527            params.speculative .model .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" 
35293528            params.speculative .model .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf" 
3530-             params.speculative .n_gpu_layers  = 99 ;
35313529            params.port  = 8012 ;
3532-             params.n_gpu_layers  = 99 ;
3533-             params.flash_attn  = true ;
35343530            params.n_ubatch  = 1024 ;
35353531            params.n_batch  = 1024 ;
35363532            params.n_ctx  = 0 ;
@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35453541            params.model .hf_repo  = " ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" 
35463542            params.model .hf_file  = " qwen3-coder-30b-a3b-instruct-q8_0.gguf" 
35473543            params.port  = 8012 ;
3548-             params.n_gpu_layers  = 99 ;
3549-             params.flash_attn  = true ;
35503544            params.n_ubatch  = 1024 ;
35513545            params.n_batch  = 1024 ;
35523546            params.n_ctx  = 0 ;
0 commit comments