@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.flash_attn = true;
+        {"-fa", "--flash-attn"}, "FA",
+        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+        [](common_params & params, const std::string & value) {
+            if (value == "on" || value == "enabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+            } else if (value == "off" || value == "disabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            } else if (value == "auto") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+            }
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3527,10 +3526,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3545,8 +3541,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
             params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
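
For reference, a minimal standalone sketch of the tri-state parsing introduced in the first hunk. The local enum and the parse_flash_attn() helper below are hypothetical stand-ins for illustration only, not the llama.h definitions; they merely mirror the accepted "on"/"off"/"auto" (plus "enabled"/"disabled") spellings from the diff.

    // Standalone sketch of the new -fa / --flash-attn value parsing.
    // The enum and helper names here are illustrative, not the library API.
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    enum flash_attn_mode { FA_MODE_AUTO, FA_MODE_DISABLED, FA_MODE_ENABLED };

    static flash_attn_mode parse_flash_attn(const std::string & value) {
        if (value == "on"  || value == "enabled")  { return FA_MODE_ENABLED;  }
        if (value == "off" || value == "disabled") { return FA_MODE_DISABLED; }
        if (value == "auto")                       { return FA_MODE_AUTO;     }
        throw std::runtime_error("unknown value for --flash-attn: '" + value + "'");
    }

    int main() {
        // value as it would arrive from "-fa auto" on the command line or
        // from the LLAMA_ARG_FLASH_ATTN environment variable registered via set_env()
        std::printf("mode = %d\n", static_cast<int>(parse_flash_attn("auto")));
        return 0;
    }

With the flag now taking a value, usage becomes e.g. "-fa auto" on the command line or LLAMA_ARG_FLASH_ATTN=on via the environment; the preset hunks above simply stop hard-coding flash_attn and n_gpu_layers.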