@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15451545        }
15461546    ).set_examples ({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
15471547    add_opt (common_arg (
1548-         {" -fa"  , " --flash-attn"  },
1549-         string_format (" enable Flash Attention (default: %s)"  , params.flash_attn  ? " enabled"   : " disabled"  ),
1550-         [](common_params & params) {
1551-             params.flash_attn  = true ;
1548+         {" -fa"  , " --flash-attn"  }, " FA"  ,
1549+         string_format (" set Flash Attention use ('on', 'off', or 'auto', default: '%s')"  , llama_flash_attn_type_name (params.flash_attn_type )),
1550+         [](common_params & params, const  std::string & value) {
1551+             if  (value == " on"   || value == " enabled"  ) {
1552+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1553+             } else  if  (value == " off"   || value == " disabled"  ) {
1554+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1555+             } else  if  (value == " auto"  ) {
1556+                 params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_AUTO;
1557+             } else  {
1558+                 throw  std::runtime_error (string_format (" error: unknown value for --flash-attn: '%s'\n "  , value.c_str ()));
1559+             }
15521560        }
15531561    ).set_env (" LLAMA_ARG_FLASH_ATTN"  ));
15541562    add_opt (common_arg (
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34593467            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"  ;
34603468            params.model .hf_file  = " qwen2.5-coder-1.5b-q8_0.gguf"  ;
34613469            params.port  = 8012 ;
3462-             params.n_gpu_layers  = 99 ;
3463-             params.flash_attn  = true ;
34643470            params.n_ubatch  = 1024 ;
34653471            params.n_batch  = 1024 ;
34663472            params.n_ctx  = 0 ;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34753481            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"  ;
34763482            params.model .hf_file  = " qwen2.5-coder-3b-q8_0.gguf"  ;
34773483            params.port  = 8012 ;
3478-             params.n_gpu_layers  = 99 ;
3479-             params.flash_attn  = true ;
34803484            params.n_ubatch  = 1024 ;
34813485            params.n_batch  = 1024 ;
34823486            params.n_ctx  = 0 ;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34913495            params.model .hf_repo  = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"  ;
34923496            params.model .hf_file  = " qwen2.5-coder-7b-q8_0.gguf"  ;
34933497            params.port  = 8012 ;
3494-             params.n_gpu_layers  = 99 ;
3495-             params.flash_attn  = true ;
34963498            params.n_ubatch  = 1024 ;
34973499            params.n_batch  = 1024 ;
34983500            params.n_ctx  = 0 ;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35083510            params.model .hf_file  = " qwen2.5-coder-7b-q8_0.gguf"  ;
35093511            params.speculative .model .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"  ;
35103512            params.speculative .model .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf"  ;
3511-             params.speculative .n_gpu_layers  = 99 ;
35123513            params.port  = 8012 ;
3513-             params.n_gpu_layers  = 99 ;
3514-             params.flash_attn  = true ;
35153514            params.n_ubatch  = 1024 ;
35163515            params.n_batch  = 1024 ;
35173516            params.n_ctx  = 0 ;
@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35273526            params.model .hf_file  = " qwen2.5-coder-14b-q8_0.gguf"  ;
35283527            params.speculative .model .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"  ;
35293528            params.speculative .model .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf"  ;
3530-             params.speculative .n_gpu_layers  = 99 ;
35313529            params.port  = 8012 ;
3532-             params.n_gpu_layers  = 99 ;
3533-             params.flash_attn  = true ;
35343530            params.n_ubatch  = 1024 ;
35353531            params.n_batch  = 1024 ;
35363532            params.n_ctx  = 0 ;
@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35453541            params.model .hf_repo  = " ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF"  ;
35463542            params.model .hf_file  = " qwen3-coder-30b-a3b-instruct-q8_0.gguf"  ;
35473543            params.port  = 8012 ;
3548-             params.n_gpu_layers  = 99 ;
3549-             params.flash_attn  = true ;
35503544            params.n_ubatch  = 1024 ;
35513545            params.n_batch  = 1024 ;
35523546            params.n_ctx  = 0 ;
0 commit comments