@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char * lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
16541664 " https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
16551665 options.push_back ({ " server" , " -sps, --slot-prompt-similarity SIMILARITY" ,
16561666 " how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n " , params.slot_prompt_similarity });
1667+ options.push_back ({ " server" , " --lora-init-without-apply" , " load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)" , params.lora_init_without_apply ? " enabled" : " disabled" });
16571668
16581669#ifndef LOG_DISABLE_LOGS
16591670 options.push_back ({ " logging" });
@@ -2091,17 +2102,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2140,6 +2156,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }
 
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
+}
+
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
 
@@ -3162,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
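
For context, a minimal sketch (not part of the commit) of how a caller might drive the deferred-apply flow introduced above: load adapters up front with lora_init_without_apply set, then change scales and re-apply at runtime via llama_lora_adapters_apply(). It assumes llama_init_result exposes model and context members alongside the lora_adapters vector used in this diff; gpt_params_parse, llama_init_from_gpt_params, and llama_lora_adapters_apply are the functions touched or added by this change.

// lora_hotswap_sketch.cpp (hypothetical example, not in the commit)
#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // same effect as passing --lora-init-without-apply on the command line:
    // adapters are loaded into iparams.lora_adapters but not applied yet
    params.lora_init_without_apply = true;

    llama_init_result iparams = llama_init_from_gpt_params(params);
    if (iparams.model == nullptr) {
        return 1;
    }

    // later (e.g. when the server handles POST /lora-adapters): adjust the
    // scales and re-apply; llama_lora_adapters_apply() clears the context's
    // adapters first, and a scale of 0.0f leaves an adapter loaded but inactive
    for (auto & la : iparams.lora_adapters) {
        la.scale = 0.5f;
    }
    llama_lora_adapters_apply(iparams.context, iparams.lora_adapters);

    llama_free(iparams.context);
    llama_free_model(iparams.model);
    return 0;
}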