@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1867,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
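With the per-example fields (`lora_outfile`, `cvector_outfile`) folded into the single `params.out_file`, every example that accepts `-o` can presumably open its output the same way. A sketch under that assumption (`open_output` is hypothetical, not part of the commit):

```cpp
#include <fstream>
#include <stdexcept>
#include <string>

// open the unified output file for writing, failing loudly on error
// (hypothetical helper, not part of the commit)
static std::ofstream open_output(const std::string & out_file) {
    std::ofstream out(out_file);
    if (!out) {
        throw std::runtime_error("failed to open output file '" + out_file + "'");
    }
    return out; // std::ofstream is movable since C++11
}
```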
@@ -2571,5 +2582,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
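The two `--fim-qwen-*-spec` presets differ only in the main model's HF repo and file; the draft model, port, batching, flash attention, and cache-reuse settings are identical. A hypothetical refactor that factors the shared defaults into one helper (`apply_fim_spec_defaults` is made up and not part of the commit; it uses only fields visible in the hunk above):

```cpp
// shared defaults for the --fim-qwen-*-spec server presets
// (hypothetical refactor, not part of the commit)
static void apply_fim_spec_defaults(common_params & params, const char * repo, const char * file) {
    params.hf_repo = repo;
    params.hf_file = file;
    params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
    params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
    params.speculative.n_gpu_layers = 99;
    params.port          = 8012;
    params.n_gpu_layers  = 99;   // offload all layers
    params.flash_attn    = true;
    params.n_ubatch      = 1024;
    params.n_batch       = 1024;
    params.n_ctx         = 0;    // 0 = use the model's context size
    params.n_cache_reuse = 256;
}
```

Each preset lambda would then shrink to a single call, keeping the two presets in sync by construction.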