@@ -751,6 +751,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //
 
+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto * buft = ggml_backend_dev_buffer_type(dev);
+        if (buft) {
+            buft_list[ggml_backend_buft_name(buft)] = buft;
+        }
+    }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+        std::string::size_type pos = override.find('=');
+        if (pos == std::string::npos) {
+            throw std::invalid_argument("invalid value");
+        }
+        std::string tensor_name = override.substr(0, pos);
+        std::string buffer_type = override.substr(pos + 1);
+
+        if (buft_list.find(buffer_type) == buft_list.end()) {
+            printf("Available buffer types:\n");
+            for (const auto & it : buft_list) {
+                printf("  %s\n", ggml_backend_buft_name(it.second));
+            }
+            throw std::invalid_argument("unknown buffer type");
+        }
+        // keep strings alive and avoid leaking memory by storing them in a static list
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+    }
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
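
A minimal usage sketch of the new helper (illustrative, not part of the patch; the pattern and the "CPU" buffer type name are example values — valid names are whatever ggml_backend_buft_name() reports — and the NULL sentinel mirrors what common_params_parse_ex appends in the next hunk):

    // Parse one "-ot"-style override string, then NULL-terminate the list
    // before handing it to the model loader.
    std::vector<llama_model_tensor_buft_override> overrides;
    parse_tensor_buffer_overrides("blk\\.0\\.ffn_(up|down|gate)_exps=CPU", overrides);
    if (!overrides.empty()) {
        overrides.push_back({nullptr, nullptr}); // sentinel terminator
    }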
@@ -995,6 +1028,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
+    if (!params.speculative.tensor_buft_overrides.empty()) {
+        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",
@@ -1203,6 +1240,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             common_params_print_completion(ctx_arg);
             exit(0);
         }
+        params.lr.init();
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -1471,6 +1509,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--swa-checkpoints"}, "N",
+        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
+                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+        [](common_params & params, int value) {
+            params.n_swa_checkpoints = value;
+        }
+    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2351,40 +2397,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
-            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-
-            for (const auto & override : string_split<std::string>(value, ',')) {
-                std::string::size_type pos = override.find('=');
-                if (pos == std::string::npos) {
-                    throw std::invalid_argument("invalid value");
-                }
-                std::string tensor_name = override.substr(0, pos);
-                std::string buffer_type = override.substr(pos + 1);
-
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf("  %s\n", ggml_backend_buft_name(it.second));
-                    }
-                    throw std::invalid_argument("unknown buffer type");
-                }
-                // keep strings alive and avoid leaking memory by storing them in a static vector
-                static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(tensor_name);
-                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-            }
+            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2407,6 +2428,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
+    add_opt(common_arg(
+        {"--cpu-moe-draft", "-cmoed"},
+        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+        [](common_params & params) {
+            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    add_opt(common_arg(
+        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                static std::list<std::string> buft_overrides_draft;
+                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
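
In effect, `-cmoed` pins every tensor matching `\.ffn_(up|down|gate)_exps` to the CPU buffer type, while `-ncmoed N` expands to one pattern per layer. A short sketch of that expansion, mirroring the lambda above (the helper name `draft_moe_patterns` is made up for illustration):

    // -ncmoed 2 yields blk\.0\.ffn_(up|down|gate)_exps and
    // blk\.1\.ffn_(up|down|gate)_exps, each paired with ggml_backend_cpu_buffer_type().
    static std::vector<std::string> draft_moe_patterns(int n_layers) {
        std::vector<std::string> patterns;
        for (int i = 0; i < n_layers; ++i) {
            patterns.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
        }
        return patterns;
    }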
@@ -2657,7 +2699,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -3132,7 +3174,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-tbd", "--threads-batch-draft"}, "N",
         "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3142,7 +3184,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-Cd", "--cpu-mask-draft"}, "M",
         "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3535,5 +3577,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+    add_opt(
+        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
+                   string_format(
+                       "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
+                       (double) params.lr.lr0),
+                   [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+                   string_format(
+                       "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+                       (double) params.lr.lr_min),
+                   [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
+                   string_format(
+                       "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
+                       (double) params.lr.decay_epochs),
+                   [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        { "-wd", "--weight-decay" }, "WD",
+        string_format(
+            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
+            (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
+                       string_format("fraction of data to use as validation set for training (default: %.2g).",
+                                     (double) params.val_split),
+                       [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
+               .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
+                       string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+                       [](common_params & params, int epochs) { params.lr.epochs = epochs; })
+               .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
+                       [](common_params & params, const std::string & name) {
+                           params.optimizer = common_opt_get_optimizer(name.c_str());
+                           if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                               throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+                           }
+                       })
+               .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
     return ctx_arg;
 }
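
A sketch of the decay behaviour the `-lr`, `-lr-min`, and `-decay-epochs` help text describes (an assumed form based on "exponential decay" in the help string; the actual schedule lives behind `params.lr` and is finalized by `params.lr.init()`, and `lr_opt_sketch` is a hypothetical stand-in for that struct):

    #include <algorithm>
    #include <cmath>

    struct lr_opt_sketch { float lr0; float lr_min; float decay_epochs; }; // hypothetical stand-in

    // Exponential decay from lr0 down to lr_min over decay_epochs, flat afterwards;
    // constant lr0 when decay is not configured.
    static float lr_at_epoch(const lr_opt_sketch & lr, float epoch) {
        if (lr.decay_epochs <= 0.0f || lr.lr_min <= 0.0f || lr.lr_min >= lr.lr0) {
            return lr.lr0;
        }
        const float t = std::min(epoch, lr.decay_epochs) / lr.decay_epochs;
        return lr.lr0 * std::pow(lr.lr_min / lr.lr0, t); // lr(decay_epochs) == lr_min
    }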