@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");
 
     printf("case \"$prev\" in\n");
-    printf("  --model)\n");
+    printf("  --model|-m)\n");
     printf("    COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf("    return 0\n");
     printf("    ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.flash_attn = true;
+        {"-fa", "--flash-attn"}, "FA",
+        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+        [](common_params & params, const std::string & value) {
+            if (value == "on" || value == "enabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+            } else if (value == "off" || value == "disabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            } else if (value == "auto") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
+            }
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
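Note on the hunk above: the handler now writes a tri-state params.flash_attn_type instead of the old boolean params.flash_attn. A minimal sketch of the declarations it appears to rely on, presumably provided by llama.h; only the identifiers come from the diff, the numeric values and the exact signature are assumptions for illustration:

    // sketch, not the authoritative header: tri-state Flash Attention setting
    enum llama_flash_attn_type {
        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1, // value assumed
        LLAMA_FLASH_ATTN_TYPE_DISABLED =  0, // value assumed
        LLAMA_FLASH_ATTN_TYPE_ENABLED  =  1, // value assumed
    };

    // assumed to return a short printable name for the current setting,
    // used to format the default shown in the help text above
    const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);

Since the option now declares a value parameter ("FA"), a bare `-fa` from before this change would presumably need an explicit value such as `-fa on`, or LLAMA_ARG_FLASH_ATTN set to one of 'on', 'off', or 'auto'.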
@@ -2555,15 +2563,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
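Note on the hunk above: the two extra "" arguments in these push_back calls indicate that the LoRA adapter descriptor gained two string members between the scale and the adapter pointer. A rough sketch of the aggregate shape the brace-initializers would now match; the struct name, member names, and pointer type below are inferred placeholders, not taken from this diff:

    // sketch only; real field names live in the corresponding common header
    struct common_adapter_lora_info {      // struct name assumed
        std::string path;                  // adapter file name
        float       scale;                 // user-defined scale (1.0 for --lora)
        std::string extra_a;               // placeholder for the first new field
        std::string extra_b;               // placeholder for the second new field
        struct llama_adapter_lora * ptr;   // pointer type assumed; filled in when the adapter is loaded
    };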
@@ -2954,20 +2962,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
-    add_opt(common_arg(
-        {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--props"},
         string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
         [](common_params & params) {
             params.endpoint_props = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
+    add_opt(common_arg(
+        {"--slots"},
+        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--no-slots"},
         "disables slots monitoring endpoint",
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-30b-default"},
+        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+            params.port = 8012;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;