@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15451545 }
15461546 ).set_examples ({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
15471547 add_opt (common_arg (
1548- {" -fa" , " --flash-attn" },
1549- string_format (" enable Flash Attention (default: %s)" , params.flash_attn ? " enabled" : " disabled" ),
1550- [](common_params & params) {
1551- params.flash_attn = true ;
1548+ {" -fa" , " --flash-attn" }, " FA" ,
1549+ string_format (" set Flash Attention use ('on', 'off', or 'auto', default: '%s')" , llama_flash_attn_type_name (params.flash_attn_type )),
1550+ [](common_params & params, const std::string & value) {
1551+ if (value == " on" || value == " enabled" ) {
1552+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1553+ } else if (value == " off" || value == " disabled" ) {
1554+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1555+ } else if (value == " auto" ) {
1556+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1557+ } else {
1558+ throw std::runtime_error (string_format (" error: unknown value for --flash-attn: '%s'\n " , value.c_str ()));
1559+ }
15521560 }
15531561 ).set_env (" LLAMA_ARG_FLASH_ATTN" ));
15541562 add_opt (common_arg (
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34593467 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF" ;
34603468 params.model .hf_file = " qwen2.5-coder-1.5b-q8_0.gguf" ;
34613469 params.port = 8012 ;
3462- params.n_gpu_layers = 99 ;
3463- params.flash_attn = true ;
34643470 params.n_ubatch = 1024 ;
34653471 params.n_batch = 1024 ;
34663472 params.n_ctx = 0 ;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34753481 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF" ;
34763482 params.model .hf_file = " qwen2.5-coder-3b-q8_0.gguf" ;
34773483 params.port = 8012 ;
3478- params.n_gpu_layers = 99 ;
3479- params.flash_attn = true ;
34803484 params.n_ubatch = 1024 ;
34813485 params.n_batch = 1024 ;
34823486 params.n_ctx = 0 ;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34913495 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" ;
34923496 params.model .hf_file = " qwen2.5-coder-7b-q8_0.gguf" ;
34933497 params.port = 8012 ;
3494- params.n_gpu_layers = 99 ;
3495- params.flash_attn = true ;
34963498 params.n_ubatch = 1024 ;
34973499 params.n_batch = 1024 ;
34983500 params.n_ctx = 0 ;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35083510 params.model .hf_file = " qwen2.5-coder-7b-q8_0.gguf" ;
35093511 params.speculative .model .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
35103512 params.speculative .model .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
3511- params.speculative .n_gpu_layers = 99 ;
35123513 params.port = 8012 ;
3513- params.n_gpu_layers = 99 ;
3514- params.flash_attn = true ;
35153514 params.n_ubatch = 1024 ;
35163515 params.n_batch = 1024 ;
35173516 params.n_ctx = 0 ;
@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35273526 params.model .hf_file = " qwen2.5-coder-14b-q8_0.gguf" ;
35283527 params.speculative .model .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
35293528 params.speculative .model .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
3530- params.speculative .n_gpu_layers = 99 ;
35313529 params.port = 8012 ;
3532- params.n_gpu_layers = 99 ;
3533- params.flash_attn = true ;
35343530 params.n_ubatch = 1024 ;
35353531 params.n_batch = 1024 ;
35363532 params.n_ctx = 0 ;
@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35453541 params.model .hf_repo = " ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" ;
35463542 params.model .hf_file = " qwen3-coder-30b-a3b-instruct-q8_0.gguf" ;
35473543 params.port = 8012 ;
3548- params.n_gpu_layers = 99 ;
3549- params.flash_attn = true ;
35503544 params.n_ubatch = 1024 ;
35513545 params.n_batch = 1024 ;
35523546 params.n_ctx = 0 ;
0 commit comments