@@ -1547,18 +1547,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15471547 }
15481548 ).set_examples ({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
15491549 add_opt (common_arg (
1550- {" -fa" , " --flash-attn" }, " FA" ,
1551- string_format (" set Flash Attention use ('on', 'off', or 'auto', default: '%s')" , llama_flash_attn_type_name (params.flash_attn_type )),
1552- [](common_params & params, const std::string & value) {
1553- if (value == " on" || value == " enabled" ) {
1554- params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1555- } else if (value == " off" || value == " disabled" ) {
1556- params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1557- } else if (value == " auto" ) {
1558- params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1559- } else {
1560- throw std::runtime_error (string_format (" error: unkown value for --flash-attn: '%s'\n " , value.c_str ()));
1561- }
1550+ {" -fa" , " --flash-attn" },
1551+ string_format (" enable Flash Attention (default: %s)" , params.flash_attn ? " enabled" : " disabled" ),
1552+ [](common_params & params) {
1553+ params.flash_attn = true ;
15621554 }
15631555 ).set_env (" LLAMA_ARG_FLASH_ATTN" ));
15641556 add_opt (common_arg (
@@ -3469,6 +3461,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34693461 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF" ;
34703462 params.model .hf_file = " qwen2.5-coder-1.5b-q8_0.gguf" ;
34713463 params.port = 8012 ;
3464+ params.n_gpu_layers = 99 ;
3465+ params.flash_attn = true ;
34723466 params.n_ubatch = 1024 ;
34733467 params.n_batch = 1024 ;
34743468 params.n_ctx = 0 ;
@@ -3483,6 +3477,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34833477 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF" ;
34843478 params.model .hf_file = " qwen2.5-coder-3b-q8_0.gguf" ;
34853479 params.port = 8012 ;
3480+ params.n_gpu_layers = 99 ;
3481+ params.flash_attn = true ;
34863482 params.n_ubatch = 1024 ;
34873483 params.n_batch = 1024 ;
34883484 params.n_ctx = 0 ;
@@ -3497,6 +3493,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
34973493 params.model .hf_repo = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" ;
34983494 params.model .hf_file = " qwen2.5-coder-7b-q8_0.gguf" ;
34993495 params.port = 8012 ;
3496+ params.n_gpu_layers = 99 ;
3497+ params.flash_attn = true ;
35003498 params.n_ubatch = 1024 ;
35013499 params.n_batch = 1024 ;
35023500 params.n_ctx = 0 ;
@@ -3512,7 +3510,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35123510 params.model .hf_file = " qwen2.5-coder-7b-q8_0.gguf" ;
35133511 params.speculative .model .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
35143512 params.speculative .model .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
3513+ params.speculative .n_gpu_layers = 99 ;
35153514 params.port = 8012 ;
3515+ params.n_gpu_layers = 99 ;
3516+ params.flash_attn = true ;
35163517 params.n_ubatch = 1024 ;
35173518 params.n_batch = 1024 ;
35183519 params.n_ctx = 0 ;
@@ -3528,7 +3529,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35283529 params.model .hf_file = " qwen2.5-coder-14b-q8_0.gguf" ;
35293530 params.speculative .model .hf_repo = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF" ;
35303531 params.speculative .model .hf_file = " qwen2.5-coder-0.5b-q8_0.gguf" ;
3532+ params.speculative .n_gpu_layers = 99 ;
35313533 params.port = 8012 ;
3534+ params.n_gpu_layers = 99 ;
3535+ params.flash_attn = true ;
35323536 params.n_ubatch = 1024 ;
35333537 params.n_batch = 1024 ;
35343538 params.n_ctx = 0 ;
@@ -3543,6 +3547,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
35433547 params.model .hf_repo = " ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" ;
35443548 params.model .hf_file = " qwen3-coder-30b-a3b-instruct-q8_0.gguf" ;
35453549 params.port = 8012 ;
3550+ params.n_gpu_layers = 99 ;
3551+ params.flash_attn = true ;
35463552 params.n_ubatch = 1024 ;
35473553 params.n_batch = 1024 ;
35483554 params.n_ctx = 0 ;
0 commit comments