@@ -4,22 +4,21 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 
-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
 
     return tensors[il];
 }
 
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }
 
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return 0;
+        return true;
     }
 
     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
     }
 
     if (tensors.empty()) {
         if (!init(model)) {
-            return 1;
+            return false;
         }
     }
 
@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }
 
-    return 0;
+    return true;
 }
 
 // lora
 
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx      = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -248,6 +247,26 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
     }
 
+    // get extra buffer types of the CPU
+    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+    std::vector<ggml_backend_buffer_type_t> buft_extra;
+    {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_extra.emplace_back(*extra_bufts);
+                ++extra_bufts;
+            }
+        }
+    }
+
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
@@ -264,7 +283,23 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        for (auto & ex : buft_extra) {
+            if (ex == buft) {
+                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                break;
+            }
+        }
+
+        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+        ggml_context * dev_ctx = ctx_for_buft(buft);
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +316,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
 
         // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +343,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);
@@ -327,8 +362,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +377,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
     return nullptr;
 }
 
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
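
llama_adapter_cvec::apply() now reports success as a bool instead of a 0/1 int32_t status code, so a caller checks the result directly. A minimal sketch, not part of this diff; the trailing n_embd/il_start/il_end parameters are assumed from the accompanying header, which this diff does not show:

// hypothetical caller sketch (assumed names, not from this commit)
static bool set_control_vector(llama_adapter_cvec & cvec, const llama_model & model,
                               const float * data, size_t len, int32_t n_embd,
                               int32_t il_start, int32_t il_end) {
    // apply() now returns a success flag rather than an int status code
    if (!cvec.apply(model, data, len, n_embd, il_start, il_end)) {
        LLAMA_LOG_ERROR("%s: failed to apply control vector\n", __func__);
        return false;
    }
    return true;
}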
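The llama_adapter_lora_init()/llama_adapter_lora_free() pair only drops the redundant struct keyword, so callers are unaffected. A hedged usage sketch, assuming an already-loaded llama_model pointer and an adapter path; neither is part of this diff:

// hypothetical usage sketch, not part of this commit
static bool try_load_lora(llama_model * model, const char * path) {
    // llama_adapter_lora_init() catches exceptions from the _impl function and returns nullptr on failure
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
    if (adapter == nullptr) {
        return false;
    }
    // ... use the adapter (e.g. attach it to a context), then release it ...
    llama_adapter_lora_free(adapter);
    return true;
}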