@@ -1,23 +1,25 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
 
     return tensors[il];
 }
 
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }
 
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             }
 
             ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);
 
             return ctx;
         }
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };
 
     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }
 
     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }
 
     return true;
 }
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
 
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;
         return 0;
     }
 
@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
         return 1;
     }
 
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }
 
-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;
 
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
 
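// [illustrative sketch, not part of the diff] How the flat `data` buffer read by
// llama_adapter_cvec::apply() above might be packed on the host side: layer 0 never
// gets a direction, so the vector for layer il starts at off = n_embd * (il - 1).
// The helper name and variables below are hypothetical.

#include <cstdint>
#include <vector>

// dirs[0] corresponds to model layer 1; each entry holds exactly n_embd floats
static std::vector<float> pack_control_vector(const std::vector<std::vector<float>> & dirs, int32_t n_embd) {
    std::vector<float> data;
    data.reserve(dirs.size() * (size_t) n_embd);
    for (const auto & d : dirs) {
        data.insert(data.end(), d.begin(), d.end()); // layer il lands at offset n_embd * (il - 1)
    }
    return data;
}

// usage sketch: cvec.apply(model, data.data(), data.size(), n_embd, /*il_start=*/1, /*il_end=*/n_layer - 1);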
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
 
 // lora
 
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -250,33 +251,41 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
 
         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }
 
         // save tensor to adapter
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
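// [illustrative note, not part of the diff] Shape convention implied by the checks in the
// hunk above (ggml ne[] order, LoRA rank r), for a regular (non-token_embd) tensor:
//     base W : ne[0] = n_in, ne[1] = n_out
//     lora_a : ne[0] = n_in, ne[1] = r
//     lora_b : ne[0] = r,    ne[1] = n_out
// e.g. W = [4096, 11008] with r = 16 gives lora_a = [4096, 16] and lora_b = [16, 11008],
// consistent with a low-rank update of the form W + scale * (B x A).
// For token_embd.weight the roles of A and B are flipped, as the code comment above notes.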
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
 
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
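// [illustrative sketch, not part of the diff] Loading and releasing a LoRA adapter through
// the renamed entry points above. The header name, file path, and surrounding usage are
// assumptions for the sake of the example; error handling mirrors the nullptr return above.

#include "llama.h"

static void load_lora_example(llama_model * model) {
    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter == nullptr) {
        return; // llama_adapter_lora_init() already logged the failure
    }

    // ... attach the adapter to a llama_context and run inference ...

    llama_adapter_lora_free(adapter);
}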