#include "llama-adapter.h"

+#include "llama-impl.h"
+#include "llama-mmap.h"
#include "llama-model.h"

#include <algorithm>

// vec

-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
    return cur;
}

-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
            }

            ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);

            return ctx;
        }
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
    };

    // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
    }

    return true;
}
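Note: init() builds one ggml context per backend buffer type (via the ctx_for_buft lambda above), one F32 tensor of length n_embd per layer, then one backend buffer per context, zero-initialized so an unset layer steers nothing. Condensed, the allocation pattern is roughly the following (a sketch, not the literal code; the buffer allocation itself sits in the elided context of this hunk):

    // one context per buffer type, one buffer per context
    ggml_context * ctx = ctx_for_buft(buft); // shared by all layers on the same device
    ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    ggml_backend_buffer_clear(buf, 0); // zeroed tensors are a no-op until apply() uploads data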

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;
        return 0;
    }

@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
        return 1;
    }

-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
            return 1;
        }
    }

-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

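apply() only uploads the per-layer direction data; the steering itself happens at graph-build time through apply_to(), which adds the layer's direction vector to the hidden state. A minimal sketch of the call site (ctx0, cur and il are illustrative names from the graph-build code, not part of this diff):

    // per layer, after computing the layer output:
    cur = cvec.apply_to(ctx0, cur, il); // no-op when no vector is loaded for this layer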
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(

// lora

-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
    return nullptr;
}

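get_weight() is the lookup the graph-build code uses to find the (A, B) pair shadowing a base weight w. The runtime effect is the usual LoRA form y = W·x + scale·(B·(A·x)); a hedged sketch of how a caller might apply one pair with ggml (mm_res, adapter and scale are illustrative names; the real call site lives in the graph-build code):

    struct ggml_tensor * mm_res = ggml_mul_mat(ctx0, w, cur); // base matmul
    llama_adapter_lora_weight * lw = adapter->get_weight(w);
    if (lw != nullptr) {
        // delta = B (A x), scaled and added on top of the base result
        struct ggml_tensor * ab = ggml_mul_mat(ctx0, lw->b, ggml_mul_mat(ctx0, lw->a, cur));
        mm_res = ggml_add(ctx0, mm_res, ggml_scale(ctx0, ab, scale));
    }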
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    };

    // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
@@ -250,33 +251,41 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

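The token_embd special case above exists because the embedding matrix is consumed by row lookup (ggml_get_rows) rather than by matmul, so the per-token LoRA delta has to be gathered row-wise as well; that is why A and B are flipped and B is stored non-transposed. Roughly, on the lookup side (a sketch following the llm_build_inp_embd() pointer in the diff's comment; ctx0, lw and scale are illustrative names):

    // base lookup: one embedding row per token
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, tok_embd, inp_tokens);
    // LoRA delta: gather per-token rows of A, project through non-transposed B
    struct ggml_tensor * delta = ggml_mul_mat(ctx0, lw->b, ggml_get_rows(ctx0, lw->a, inp_tokens));
    inpL = ggml_add(ctx0, inpL, ggml_scale(ctx0, delta, scale));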
    // allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,

    return nullptr;
}
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
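After the rename, the public pairing is llama_adapter_lora_init() / llama_adapter_lora_free(), with attachment to a context handled elsewhere in the API. A minimal usage sketch (assuming the post-rename llama.h declarations; the adapter path and scale are illustrative, error handling elided):

    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "lora-adapter.gguf");
    if (adapter != nullptr) {
        llama_set_adapter_lora(ctx, adapter, 1.0f); // assumed companion setter in llama.h
        // ... run generations ...
        llama_adapter_lora_free(adapter); // plain delete; free only once no context still uses it
    }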