From b5c3eaa19f7e2a789dcc56fa92874700f0c1cefe Mon Sep 17 00:00:00 2001
From: juk
Date: Mon, 14 Jul 2025 14:48:57 +0100
Subject: [PATCH 1/4] Added `--lora-layer-range` option

---
 common/arg.cpp        |  8 ++++++++
 common/common.cpp     | 27 ++++++++++++++++-----------
 common/common.h       |  2 ++
 include/llama.h       |  5 ++++-
 src/llama-adapter.cpp | 31 ++++++++++++++++++++++++++++---
 5 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 56827a65908be..cc4aab8dd641e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2472,6 +2472,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(common_arg(
+        {"--lora-layer-range"}, "START", "END",
+        "layer range to apply the lora(s) to, start and end inclusive",
+        [](common_params & params, const std::string & start, const std::string & end) {
+            params.lora_layer_start = std::stoi(start);
+            params.lora_layer_end = std::stoi(end);
+        }
+    ));
     add_opt(common_arg(
         {"--control-vector"}, "FNAME",
         "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
diff --git a/common/common.cpp b/common/common.cpp
index e4e71ad13fb59..c3613b06eef89 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -982,18 +982,23 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     // load and optionally apply lora adapters
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
-        }
+    if (!params.lora_adapters.empty()) {
+        if (params.lora_layer_start <= 0) params.lora_layer_start = 1;
+        if (params.lora_layer_end <= 0) params.lora_layer_end = llama_model_n_layer(model);
+
+        for (auto & la : params.lora_adapters) {
+            llama_adapter_lora_ptr lora;
+            lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+            if (lora == nullptr) {
+                LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+                llama_free(lctx);
+                llama_model_free(model);
+                return iparams;
+            }

-        la.ptr = lora.get();
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+            la.ptr = lora.get();
+            iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        }
     }

     if (!params.lora_init_without_apply) {
diff --git a/common/common.h b/common/common.h
index a5abe32859fdd..5db658ba3f913 100644
--- a/common/common.h
+++ b/common/common.h
@@ -296,6 +296,8 @@ struct common_params {
     int32_t verbosity                  = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end   = -1; // layer range for control vector
+    int32_t lora_layer_start           = -1; // layer range for lora
+    int32_t lora_layer_end             = -1; // layer range for lora
     bool    offline                    = false;

     int32_t ppl_stride = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
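The common-side changes above split the feature across three places: the new `--lora-layer-range START END` argument, the `lora_layer_start`/`lora_layer_end` fields (defaulting to -1, i.e. unset), and the fall-back to the model's full depth in `common_init_from_params`. As a quick illustration of the combined behaviour, here is a minimal standalone sketch; the helper name `resolve_lora_layer_range` is invented for the example and is not part of the patch, and the defaulting shown reflects the lower bound of 0 adopted later in this series.

```cpp
// Illustrative sketch only -- not part of the patch.
// Parses START/END as the option handler does, then applies the same
// defaulting as common_init_from_params(): an unset bound (-1) expands
// to the widest possible range, 0 .. n_layer inclusive.
#include <cstdint>
#include <cstdio>
#include <string>
#include <utility>

static std::pair<int32_t, int32_t> resolve_lora_layer_range(
        const std::string & start, const std::string & end, int32_t n_layer) {
    int32_t il_start = std::stoi(start); // throws on non-numeric input, like the option handler
    int32_t il_end   = std::stoi(end);
    if (il_start < 0) il_start = 0;       // LoRAs can apply to layer 0
    if (il_end   < 0) il_end   = n_layer; // default: through the last layer
    return {il_start, il_end};
}

int main() {
    // e.g. "--lora-layer-range 10 20" on a 32-layer model keeps layers 10..20 inclusive
    auto [il_start, il_end] = resolve_lora_layer_range("10", "20", 32);
    std::printf("applying lora to layers %d..%d\n", il_start, il_end);
}
```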
diff --git a/include/llama.h b/include/llama.h
index f73b1ab65fe6f..778db32364145 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -544,9 +544,12 @@ extern "C" {
     //

     // Load a LoRA adapter from file
+    // il_start and il_end are the layer range the lora should apply to (both inclusive)
     LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
-            const char * path_lora);
+            const char * path_lora,
+            int32_t il_start,
+            int32_t il_end);

     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 8d94034aed95d..928e9b219d27d 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -145,7 +145,12 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     return nullptr;
 }

-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(
+        llama_model & model,
+        const char * path_lora,
+        llama_adapter_lora & adapter,
+        int32_t il_start,
+        int32_t il_end) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

     ggml_context * ctx_init;
@@ -224,6 +229,22 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_

     for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
         std::string name(cur->name);
+
+        // check if this tensor has a layer number and is outside our range
+        size_t blk_pos = name.find("blk.");
+        if (blk_pos != std::string::npos) {
+            size_t start = blk_pos + 4; // skip "blk."
+            size_t end = name.find('.', start);
+            try {
+                int layer_num = std::stoi(name.substr(start, end - start));
+                if (layer_num < il_start || layer_num > il_end) {
+                    continue; // skip this tensor
+                }
+            } catch (const std::exception & err) {
+                LLAMA_LOG_ERROR("%s: failed to parse layer number from tensor '%s': %s\n", __func__, name.c_str(), err.what());
+            }
+        }
+
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
@@ -368,11 +389,15 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+llama_adapter_lora * llama_adapter_lora_init(
+        llama_model * model,
+        const char * path_lora,
+        int32_t il_start,
+        int32_t il_end) {
     llama_adapter_lora * adapter = new llama_adapter_lora();

     try {
-        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter, il_start, il_end);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

From 25da381e39b3cd9f7cc03f721856a1583edcea72 Mon Sep 17 00:00:00 2001
From: juk
Date: Mon, 14 Jul 2025 14:52:47 +0100
Subject: [PATCH 2/4] Added missing args to `llama_adapter_lora_init` call

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index c3613b06eef89..b8714310617a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -988,7 +988,7 @@ struct common_init_result common_init_from_params(common_params & params) {

         for (auto & la : params.lora_adapters) {
             llama_adapter_lora_ptr lora;
-            lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+            lora.reset(llama_adapter_lora_init(model, la.path.c_str(), params.lora_layer_start, params.lora_layer_end));
             if (lora == nullptr) {
                 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
                 llama_free(lctx);

From 7828e4f0f73199200f64bb671c76bea1bf2c69b4 Mon Sep 17 00:00:00 2001
From: juk
Date: Mon, 14 Jul 2025 15:27:25 +0100
Subject: [PATCH 3/4] Fixed lower end of range as LoRAs can be applied to layer 0

---
 common/common.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b8714310617a5..069cdb0434de8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -983,8 +983,8 @@ struct common_init_result common_init_from_params(common_params & params) {

     // load and optionally apply lora adapters
     if (!params.lora_adapters.empty()) {
-        if (params.lora_layer_start <= 0) params.lora_layer_start = 1;
-        if (params.lora_layer_end <= 0) params.lora_layer_end = llama_model_n_layer(model);
+        if (params.lora_layer_start < 0) params.lora_layer_start = 0;
+        if (params.lora_layer_end < 0) params.lora_layer_end = llama_model_n_layer(model);

         for (auto & la : params.lora_adapters) {
             llama_adapter_lora_ptr lora;

From 71f8b75d9b686a5e4a11f2067c7846c245a158f9 Mon Sep 17 00:00:00 2001
From: juk
Date: Mon, 14 Jul 2025 15:32:58 +0100
Subject: [PATCH 4/4] Updated the `README.md` for `llama-server`

---
 tools/server/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/server/README.md b/tools/server/README.md
index 6f962664f6774..ee7a52702b6e2 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -82,6 +82,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
 | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
 | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
+| `--lora-layer-range START END` | layer range to apply the LoRA(s) to, start and end inclusive |
 | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
 | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
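With this series applied, a typical invocation might look like `llama-server -m model.gguf --lora adapter.gguf --lora-layer-range 0 15` (the file names here are placeholders). The core of the change is the per-tensor filter in `llama-adapter.cpp`, which keys off the `blk.N.` prefix used by GGUF tensor names. A minimal standalone sketch of that filter follows; the helper name `keep_lora_tensor` and the example tensor names are illustrative only and not part of the patch.

```cpp
// Illustrative sketch only -- not part of the patch. Extracts the layer index
// from a tensor name such as "blk.12.attn_q.weight.lora_a" and reports whether
// it falls inside the inclusive range [il_start, il_end]. Tensors without a
// "blk." prefix (e.g. token embeddings) and names whose layer number cannot be
// parsed are kept, matching the behaviour of the patched loader.
#include <cstdio>
#include <string>

static bool keep_lora_tensor(const std::string & name, int il_start, int il_end) {
    size_t blk_pos = name.find("blk.");
    if (blk_pos == std::string::npos) {
        return true; // no layer number in the name
    }
    size_t start = blk_pos + 4;            // skip "blk."
    size_t end   = name.find('.', start);  // end of the numeric part
    try {
        int layer = std::stoi(name.substr(start, end - start));
        return layer >= il_start && layer <= il_end;
    } catch (const std::exception &) {
        return true; // unparsable layer number: log-and-keep in the patch
    }
}

int main() {
    std::printf("%d\n", keep_lora_tensor("blk.12.attn_q.weight.lora_a", 0, 15)); // 1 (kept)
    std::printf("%d\n", keep_lora_tensor("blk.30.ffn_up.weight.lora_b",  0, 15)); // 0 (skipped)
    std::printf("%d\n", keep_lora_tensor("token_embd.weight.lora_a",     0, 15)); // 1 (kept)
}
```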