
Commit f9f1585 (merge commit, 2 parents: 1012281 + 4b0c638)

broken merge - kcpp changes will be applied above this commit for better tracking.


43 files changed (+20327 / -19897 lines)

common/arg.cpp

Lines changed: 2 additions & 2 deletions
@@ -1513,15 +1513,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0 });
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale) });
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
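The only functional change in these two handlers is the extra nullptr: common_lora_adapter_info (see the common/common.h hunk below) gains a raw llama_lora_adapter * member that stays empty until the adapter is actually loaded in common_init_from_params. A standalone sketch of the aggregate initialization, with a placeholder path:

    // Minimal sketch mirroring the updated struct; not the literal header.
    #include <string>
    #include <vector>

    struct llama_lora_adapter; // opaque handle from llama.h

    struct common_lora_adapter_info {
        std::string path;
        float scale;
        struct llama_lora_adapter * ptr; // filled in later by common_init_from_params
    };

    int main() {
        std::vector<common_lora_adapter_info> lora_adapters;
        // the adapter is not loaded at argument-parsing time, so the pointer starts as nullptr
        lora_adapters.push_back({ "adapter.gguf", 1.0f, nullptr });
    }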

common/common.cpp

Lines changed: 15 additions & 15 deletions
@@ -891,9 +891,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -924,20 +923,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -998,17 +998,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
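The loop above swaps the copied common_lora_adapter_container for a llama_lora_adapter_ptr guard: the freshly created adapter is tied to a scoped owner from the moment it exists, and on success ownership moves into iparams.lora while params.lora_adapters keeps only a raw view in la.ptr. A minimal sketch of that hand-off with stand-in types (llama_lora_adapter_ptr itself is assumed to be the unique_ptr alias from llama-cpp.h):

    // Stand-in types; shows the guard -> view -> owned-container hand-off used above.
    #include <memory>
    #include <vector>

    struct adapter {};                                       // stands in for llama_lora_adapter
    using adapter_ptr = std::unique_ptr<adapter>;            // stands in for llama_lora_adapter_ptr

    struct adapter_info { adapter * ptr = nullptr; };        // common_lora_adapter_info: non-owning
    struct init_result  { std::vector<adapter_ptr> lora; };  // common_init_result: owning

    int main() {
        adapter_info la;
        init_result  iparams;

        adapter_ptr guard(new adapter());                // "load"; released automatically if abandoned
        la.ptr = guard.get();                            // params side keeps a raw, non-owning pointer
        iparams.lora.emplace_back(std::move(guard));     // ownership ends up with the init result
    }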

common/common.h

Lines changed: 15 additions & 11 deletions
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -474,10 +472,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -499,7 +499,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -636,6 +636,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO_STR = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT_STR = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT_STR = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
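common_init_result now expresses ownership through the llama_model_ptr, llama_context_ptr and llama_lora_adapter_ptr aliases pulled in via llama-cpp.h. Their exact definitions are not part of this diff; a rough sketch of the usual shape, assuming deleters that forward to llama_free_model, llama_free and llama_lora_adapter_free:

    // Hypothetical sketch of the aliases assumed to come from llama-cpp.h.
    #include <memory>

    struct llama_model;
    struct llama_context;
    struct llama_lora_adapter;

    struct llama_model_deleter        { void operator()(llama_model * p) const        { /* llama_free_model(p); */ } };
    struct llama_context_deleter      { void operator()(llama_context * p) const      { /* llama_free(p); */ } };
    struct llama_lora_adapter_deleter { void operator()(llama_lora_adapter * p) const { /* llama_lora_adapter_free(p); */ } };

    using llama_model_ptr        = std::unique_ptr<llama_model,        llama_model_deleter>;
    using llama_context_ptr      = std::unique_ptr<llama_context,      llama_context_deleter>;
    using llama_lora_adapter_ptr = std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter>;

The split.* constants also move into an unnamed namespace and drop the _STR suffix; const namespace-scope variables already have internal linkage in C++, so the unnamed namespace mostly documents that intent when common.h is included from several translation units.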

examples/gguf-split/gguf-split.cpp

Lines changed: 6 additions & 7 deletions
@@ -3,15 +3,14 @@
 #include "build-info.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>
 
 #if defined(_WIN32)
@@ -231,9 +230,9 @@ struct split_strategy {
         if (i_split == 0) {
             gguf_set_kv(ctx_out, ctx_gguf);
         }
-        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO_STR, i_split);
-        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT_STR, 0); // placeholder
-        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT_STR, n_tensors);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
+        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
     };
 
     // initialize ctx_out for the first split
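The writer side above stamps the renamed split.* keys into each shard; a reader does the reverse. A hedged sketch of checking a shard with the public gguf API (the key names come from the constants above, everything else is illustrative and not part of this commit):

    // Illustrative reader for the split.* metadata written above.
    #include <cstdio>
    #include "ggml.h" // gguf_* API (declared in ggml.h in this tree)

    static void print_split_info(const char * fname) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) {
            fprintf(stderr, "failed to open %s\n", fname);
            return;
        }

        const int i_no    = gguf_find_key(ctx, "split.no");
        const int i_count = gguf_find_key(ctx, "split.count");
        if (i_no >= 0 && i_count >= 0) {
            printf("%s: shard %d of %d\n", fname,
                   (int) gguf_get_val_u16(ctx, i_no) + 1,
                   (int) gguf_get_val_u16(ctx, i_count));
        }

        gguf_free(ctx);
    }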

examples/main/main.cpp

Lines changed: 4 additions & 7 deletions
@@ -146,18 +146,18 @@ int main(int argc, char ** argv) {
     llama_context * ctx = nullptr;
     common_sampler * smpl = nullptr;
 
-    std::vector<common_chat_msg> chat_msgs;
-
     g_model = &model;
     g_ctx = &ctx;
     g_smpl = &smpl;
 
+    std::vector<common_chat_msg> chat_msgs;
+
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n", __func__);
@@ -890,9 +890,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     ggml_threadpool_free_fn(threadpool);
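The manual llama_free/llama_free_model calls at the end of main can go because llama_init now owns both objects: its smart pointers release the context and the model when it goes out of scope. A compressed sketch of the resulting pattern (placeholder path, error handling trimmed; not the literal example code):

    // Sketch: borrow raw pointers from the owning common_init_result and never free them by hand.
    #include "common.h"
    #include "llama.h"

    int main() {
        common_params params;
        params.model = "model.gguf"; // placeholder path

        common_init_result llama_init = common_init_from_params(params);

        llama_model   * model = llama_init.model.get();   // non-owning view
        llama_context * ctx   = llama_init.context.get(); // non-owning view
        if (model == nullptr || ctx == nullptr) {
            return 1;
        }

        // ... generate ...

        // no llama_free(ctx) / llama_free_model(model) here:
        // llama_init frees both when it is destroyed
        return 0;
    }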

examples/server/server.cpp

Lines changed: 23 additions & 42 deletions
@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,15 +198,14 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_container> set_lora;
+    std::vector<common_lora_adapter_info> set_lora;
 
     server_task(server_task_type type) : type(type) {}
 
     static slot_params params_from_json_cmpl(
             const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
-            const std::vector<common_lora_adapter_container> & lora_base,
             const json & data) {
         slot_params params;
 
@@ -265,12 +264,12 @@
 
         if (data.contains("lora")) {
             if (data.at("lora").is_array()) {
-                params.lora = parse_lora_request(lora_base, data.at("lora"));
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
             } else {
                 throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
             }
         } else {
-            params.lora = lora_base;
+            params.lora = params_base.lora_adapters;
         }
 
         // TODO: add more sanity checks for the input parameters
@@ -1132,7 +1131,7 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;
 
     // the index relative to completion multi-task request
     size_t index = 0;
@@ -1627,11 +1626,15 @@ struct server_response {
 struct server_context {
     common_params params_base;
 
+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    std::vector<common_lora_adapter_container> lora;
 
     llama_model * model_dft = nullptr;
+
    llama_context_params cparams_dft;
 
     llama_batch batch = {};
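Keeping llama_init and llama_init_dft as members turns server_context into the owner of everything downstream; the model and ctx fields that follow are plain views obtained with .get(). Because members are destroyed in reverse declaration order, declaring the owning init results first means the borrowed pointers never outlive their storage. A reduced sketch of that layout with stand-in types:

    // Stand-in types only; illustrates the declaration-order/lifetime argument, not the real server.
    #include <memory>

    struct init_result { std::unique_ptr<int> handle; };

    struct ctx_sketch {
        init_result init;     // declared first  => destroyed last (owns the real object)
        int * view = nullptr; // declared later  => destroyed first (non-owning)

        void load() {
            init.handle = std::make_unique<int>(42);
            view        = init.handle.get();
        }
    };

    int main() {
        ctx_sketch s;
        s.load();
        // destruction order: 'view' (trivial) then 'init', so 'view' never dangles during teardown
    }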
@@ -1655,21 +1658,6 @@
     float slot_prompt_similarity = 0.0f;
 
     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1692,11 +1680,10 @@
 
         params_base = params;
 
-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);
 
-        model = llama_init.model;
-        ctx = llama_init.context;
-        lora = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();
 
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
@@ -1719,35 +1706,29 @@
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
 
-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);
 
-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
 
-                llama_free (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }
 
-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
 
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }
 
         return true;
@@ -1898,7 +1879,7 @@
         if (!are_lora_equal(task.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = std::move(task.params.lora);
+            slot.lora = task.params.lora;
         }
 
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
@@ -2592,7 +2573,7 @@
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    lora = std::move(task.set_lora);
+                    params_base.lora_adapters = std::move(task.set_lora);
                     auto res = std::make_unique<server_task_result_apply_lora>();
                     res->id = task.id;
                     queue_results.send(std::move(res));
@@ -3671,7 +3652,6 @@ int main(int argc, char ** argv) {
                 ctx_server.model,
                 ctx_server.ctx,
                 ctx_server.params_base,
-                ctx_server.lora,
                 data);
             task.id_selected_slot = json_value(data, "id_slot", -1);
 
@@ -4098,8 +4078,9 @@ int main(int argc, char ** argv) {
 
     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
         json result = json::array();
-        for (size_t i = 0; i < ctx_server.lora.size(); ++i) {
-            auto & lora = ctx_server.lora[i];
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
             result.push_back({
                 {"id", i},
                 {"path", lora.path},
@@ -4118,7 +4099,7 @@ int main(int argc, char ** argv) {
         }
         server_task task(SERVER_TASK_TYPE_SET_LORA);
         task.id = ctx_server.queue_tasks.get_new_id();
-        task.set_lora = parse_lora_request(ctx_server.lora, body);
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);
 
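With the adapter list now living in params_base.lora_adapters, the per-request "lora" field, the SET_LORA task, and the /lora-adapters endpoints all resolve against the same source of truth. A hedged example of the request shape implied by the error message above (an array of objects with 'id' and 'scale', where 'id' indexes into the configured adapters), built with the same nlohmann json type the server uses; the include path and the "prompt" field are illustrative, not taken from this commit:

    // Illustrative only: builds a 'lora' array like the one params_from_json_cmpl expects.
    #include <nlohmann/json.hpp>
    #include <iostream>

    using json = nlohmann::ordered_json;

    int main() {
        json body;
        body["prompt"] = "Hello";
        body["lora"] = json::array({
            { {"id", 0}, {"scale", 0.5} }, // scale the first configured adapter down
            { {"id", 1}, {"scale", 0.0} }  // disable the second one for this request
        });
        std::cout << body.dump(2) << std::endl;
    }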
